def get_images_and_capture_dates(self):
        """Collect image files under ``self.path`` and their EXIF capture dates.

        Returns a pair ``(images_with_times, capture_times)``: the file names
        (as unicode) that carried a parseable EXIF date, and the matching
        ``datetime.date`` objects in the same order.  Files that are corrupt,
        carry no EXIF date, or whose date cannot be parsed are skipped.
        """
        img_file_names = []

        if self.recursive:
            for root, _, file_names in os.walk(self.path):
                img_file_names += [to_unicode(os.path.join(root, file_name))
                                   for file_name in file_names if self.pattern.match(file_name)]
        else:
            img_file_names = [to_unicode(os.path.join(self.path, file_name))
                              for file_name in os.listdir(self.path)
                              if os.path.isfile(os.path.join(self.path, file_name)) and self.pattern.match(file_name)]

        capture_times = []
        images_with_times = []
        for img_file_name in img_file_names:
            with open(img_file_name, 'rb') as img_file:
                try:
                    tags = exifread.process_file(img_file, stop_tag="DateTimeOriginal", details=False)
                except Exception:
                    # bug fix: a bare ``except:`` also swallowed
                    # KeyboardInterrupt/SystemExit; only treat real errors
                    # (corrupt files) as skippable
                    continue
                date = tags.get("EXIF DateTimeOriginal") or tags.get("EXIF DateTimeDigitized") \
                       or tags.get("Image DateTime")
                if not date:
                    continue  # skip images without info when they were taken
                try:
                    # EXIF date format is YYYY:MM:DD HH:MM:SS
                    date = datetime.strptime(date.printable, "%Y:%m:%d %H:%M:%S").date()
                except ValueError:
                    continue  # skip image if parsing the date fails

                capture_times.append(date)
                images_with_times.append(img_file_name)

        return images_with_times, capture_times
Example #2
0
File: reply.py Project: qq40660/wx
 def to_xml(self):
     """Render this music reply as XML via the class template.

     Bug fix: the first template argument read ``self.tosuer`` (a typo);
     the attribute is named ``touser`` in every other reply and in the
     message-parsing code.
     """
     args = (
         self.touser,
         self.fromuser,
         self.create_time,
         self.msg_type,
         to_unicode(self.music_title),
         to_unicode(self.music_description),
         self.music_url,
         self.music_url,
         self.music_hq_url,
         self.func_flag
     )
     return self.xml_template % args
Example #3
0
    def save(self):
        """Write the configuration options to the primary file.

        Only options whose current value differs from the inherited default
        are written; overridden options are emitted as inert comments.
        """
        if not self.filename:
            return  # in-memory configuration: nothing to persist

        # Only save options that differ from the defaults
        sections = []
        for section in self.sections():
            options = []
            for option in self[section]:
                default = None
                if self.parent:
                    # value inherited from the parent (global) configuration
                    default = self.parent.get(section, option)
                # ``current`` is False when the option is not set locally
                current = self.parser.has_option(section, option) and \
                          to_unicode(self.parser.get(section, option))
                if current is not False and current != default:
                    options.append((option, current))
            if options:
                sections.append((section, sorted(options)))

        fileobj = open(self.filename, 'w')
        try:
            fileobj.write('# -*- coding: utf-8 -*-\n\n')
            for section, options in sections:
                fileobj.write('[%s]\n' % section)
                for key, val in options:
                    if key in self[section].overridden:
                        fileobj.write('# %s = <inherited>\n' % key)
                    else:
                        # continuation lines of multi-line values must be
                        # indented so ConfigParser can read them back
                        val = val.replace(CRLF, '\n').replace('\n', '\n ')
                        fileobj.write('%s = %s\n' % (key, val.encode('utf-8')))
                fileobj.write('\n')
        finally:
            fileobj.close()
Example #4
0
    def __init__(self,
                 trigger,
                 func,
                 args,
                 kwargs,
                 misfire_grace_time,
                 coalesce,
                 name=None,
                 max_runs=None,
                 max_instances=1):
        """Validate the job parameters and initialise the job state.

        Raises ValueError/TypeError for unusable parameters; validation
        runs in parameter order, so the first offending argument wins.
        """
        # -- parameter validation --------------------------------------
        if not trigger:
            raise ValueError('The trigger must not be None')
        if not hasattr(func, '__call__'):
            raise TypeError('func must be callable')
        if not hasattr(args, '__getitem__'):
            raise TypeError('args must be a list-like object')
        if not hasattr(kwargs, '__getitem__'):
            raise TypeError('kwargs must be a dict-like object')
        if misfire_grace_time <= 0:
            raise ValueError('misfire_grace_time must be a positive value')
        if max_runs is not None and max_runs <= 0:
            raise ValueError('max_runs must be a positive value')
        if max_instances <= 0:
            raise ValueError('max_instances must be a positive value')

        # -- state -----------------------------------------------------
        self._lock = Lock()
        self.trigger = trigger
        self.func = func
        self.args = args
        self.kwargs = kwargs
        # fall back to the callable's own name when none was given
        self.name = to_unicode(name or get_callable_name(func))
        self.misfire_grace_time = misfire_grace_time
        self.coalesce = coalesce
        self.max_runs = max_runs
        self.max_instances = max_instances
        self.runs = 0
        self.instances = 0
Example #5
0
    def prepro(self, contexts):
        """Tokenize raw contexts and pack them into a fixed-size id tensor.

        Each context is split into sub-sentences on ASCII and full-width
        sentence punctuation, each sub-sentence is word-tokenized, and the
        tokens are written into an int32 tensor of shape
        (batch_size, para_max_num, para_max_length).  Extra paragraphs and
        tokens are truncated; unknown words map to index 1 (OOV).
        """
        context_tokens = []
        for text in contexts:
            text = to_unicode(text)
            # split on both ASCII and full-width sentence punctuation
            lst = re.split(r",|\?|!|。|,|?|!", text)
            tokens = []
            for x in lst:
                para_tokens = word_tokenize(x)
                tokens.append(para_tokens)
            context_tokens.append(tokens)

        context_idxs = np.zeros([self.batch_size, self.para_max_num, self.para_max_length], dtype=np.int32)

        def _get_word(each):
            # index 1 is reserved for out-of-vocabulary tokens
            if each in self.word2idx_dict:
                return self.word2idx_dict[each]
            return 1

        for b, context_token in enumerate(context_tokens):
            for i, tokens in enumerate(context_token):
                # bug fix: the limits were referenced as bare names
                # (para_max_num / para_max_length), which only exist as
                # instance attributes (see the np.zeros call above) --
                # the loop raised a NameError
                if i < self.para_max_num:
                    for j, token in enumerate(tokens):
                        if j < self.para_max_length:
                            context_idxs[b, i, j] = _get_word(token)
        return context_idxs
Example #6
0
 def _check(self, value):
     """Validate *value* as a mobile number: starts with '1', all digits,
     exactly 11 characters long.

     Returns a (valid, data) pair; ``data`` is the value itself when valid,
     otherwise the default error message.
     """
     valid = (value
              and value.startswith('1')
              and value.isdigit()
              and len(util.to_unicode(value)) == 11)
     data = value if valid else self._messages['default']
     return valid, data
Example #7
0
 def __init__(self, field_name=None, default_value=None):
     """Set up per-field error messages and message-template variables.

     ``default_value``, when given, overrides the class-level default.
     """
     self._messages = dict(_default_messages)
     self._message_vars = {'name': util.to_unicode(field_name)}
     if default_value is not None:
         self._default_value = default_value
Example #8
0
 def __init__(self, field_name=None, default_value=None):
     """Initialise validator state: error messages and template variables."""
     messages = {}
     messages.update(_default_messages)
     self._messages = messages
     # expose the field name to message templates
     self._message_vars = dict(name=util.to_unicode(field_name))
     if default_value is not None:
         # an explicit default beats the class-level one
         self._default_value = default_value
Example #9
0
 def _check(self, value):
     """Check that *value* looks like an 11-digit mobile number ('1' prefix).

     Returns (valid, data): the original value when valid, otherwise the
     default error message.
     """
     valid = (value and value.startswith('1') and value.isdigit()
              and len(util.to_unicode(value)) == 11)
     if valid:
         return valid, value
     return valid, self._messages['default']
Example #10
0
    def _check(self, value):
        """Validate *value*'s length and, optionally, its regex format.

        Returns (False, message) on failure, (True, value) on success.
        """
        if not self._check_mm(len(util.to_unicode(value))):
            return False, self._messages[self._message_key + '_len']

        if self._format:
            import re
            if not re.match(self._format, value):
                return False, self._messages['format']
        return True, value
Example #11
0
    def _check(self, value):
        """Length check followed by an optional regex format check."""
        length_ok = self._check_mm(len(util.to_unicode(value)))
        if not length_ok:
            message = self._messages[self._message_key + '_len']
            return False, message

        if self._format:
            import re
            if re.match(self._format, value) is None:
                return False, self._messages['format']
        return True, value
Example #12
0
File: reply.py Project: qq40660/wx
 def to_xml(self):
     """Render this text reply through the class XML template."""
     return self.xml_template % (
         self.touser,
         self.fromuser,
         self.create_time,
         self.msg_type,
         0,
         to_unicode(self.content),
     )
Example #13
0
 def process_body(self, body, url, obj_id):
     body = to_unicode(body)
     body.replace('<?xml version="1.0" encoding="utf-8"?>', "")
     body = self.cleaner.clean_html(body)
     with open("../data/mining_task/" + str(obj_id), "wb") as fout:
         g = gzip.GzipFile(mode="wb", fileobj=fout)
         try:
             g.write(body.encode("utf-8"))
         finally:
             g.close()
     print url
Example #14
0
    def _check(self, value):
        """Check length bounds, then that *value* is digits once '-' and
        ' ' separators are removed."""
        if not self._check_mm(len(util.to_unicode(value))):
            return False, self._messages[self._message_key + '_len']

        stripped = value.replace('-', '').replace(' ', '')
        valid = stripped.isdigit()
        if valid:
            return valid, value
        return valid, self._messages['default']
Example #15
0
    def _check(self, value):
        """Validate a digit string that may contain '-' or ' ' separators."""
        if not self._check_mm(len(util.to_unicode(value))):
            message = self._messages[self._message_key + '_len']
            return False, message

        valid = value.replace('-', '').replace(' ', '').isdigit()
        data = value if valid else self._messages['default']
        return valid, data
Example #16
0
File: task.py Project: qq40660/wx
    def get(self):
        users = WXUser.all()
        p = Push()
        if not users.count():
            return
        opener = poster.streaminghttp.register_openers()
        weatherinfo = json.loads(opener.open(settings.weather1_url % settings.weather_city, timeout=5).read())['weatherinfo']
        logging.info(weatherinfo)
        city = weatherinfo['city']
        temp = weatherinfo['temp']
        wd = weatherinfo['WD']
        ws = weatherinfo['WS']
        sd = weatherinfo['WS']
        time = weatherinfo['time']
        args = (to_unicode(city), temp, to_unicode(wd), to_unicode(ws), sd, time)
        logging(str(args))
        for user in users:
            msg = '''
城市:%s
温度:%s 摄氏度
风向:%s
风力:%s
湿度:%s
发布时间:%s''' % (to_unicode(city), temp, to_unicode(wd), to_unicode(ws), sd, time)
            logging.info(msg)
            p.send_txt_msg(user.fake_id, msg)
Example #17
0
    def set(self, name, value):
        """Change a configuration value.

        These changes are not persistent unless saved with `save()`.
        """
        parser = self.config.parser
        if not parser.has_section(self.name):
            parser.add_section(self.name)
        if value is None:
            # record the override and store an empty value
            self.overridden[name] = True
            value = ''
        else:
            value = to_unicode(value).encode('utf-8')
        return parser.set(self.name, name, value)
Example #18
0
def parse_message(xml):
    """Parse a WeChat (weixin) XML payload into a message object.

    Returns the message instance matching the payload's MsgType, or None
    for empty input or an unknown type.
    """
    if not xml:
        return
    logging.info(xml)
    root = et.fromstring(xml)

    def text_of(tag):
        # plain text content of a direct child element
        return root.find(tag).text

    _msg = dict(
        touser=text_of('ToUserName'),
        fromuser=text_of('FromUserName'),
        create_time=text_of('CreateTime')
    )
    msg_type = text_of('MsgType')
    if msg_type == 'text':
        _msg['content'] = to_unicode(text_of('Content'))
        _msg['msg_id'] = text_of('MsgId')
        return TextMessage(**_msg)
    elif msg_type == 'image':
        _msg['pic_url'] = text_of('PicUrl')
        _msg['msg_id'] = text_of('MsgId')
        return ImageMessage(**_msg)
    elif msg_type == 'location':
        _msg['x'] = text_of('Location_x')
        _msg['y'] = text_of('Location_y')
        _msg['scale'] = text_of('Scale')
        _msg['label'] = to_unicode(text_of('Label'))
        _msg['msg_id'] = text_of('MsgId')
        return LocationMessage(**_msg)
    elif msg_type == 'link':
        _msg['title'] = to_unicode(text_of('Title'))
        _msg['description'] = to_unicode(text_of('Description'))
        _msg['url'] = text_of('Url')
        _msg['msg_id'] = text_of('MsgId')
        return LinkMessage(**_msg)
    elif msg_type == 'event':
        # events carry no MsgId in the payload
        _msg['event'] = text_of('Event')
        _msg['event_key'] = text_of('EventKey')
        _msg['msg_id'] = None
        return EventMessage(**_msg)
Example #19
0
    def _check(self, value):
        """Validate an email address within the URS length limits.

        The first call lazily fills in missing min/max bounds from the URS
        defaults and records them for message templates.
        """
        if self._max is None:
            self._max = self._max_urs_length
            self._message_vars['max_len'] = self._max
        if self._min is None:
            self._min = self._min_urs_length
            self._message_vars['min_len'] = self._min

        if not self._check_mm(len(util.to_unicode(value))):
            return False, self._messages[self._message_key + '_len']

        return self.is_email(value.lower())
Example #20
0
    def _check(self, value):
        """Length-check *value*, then validate it as a (lower-cased) email."""
        # fall back to the URS limits when no explicit bounds were set
        if self._max is None:
            self._max = self._max_urs_length
            self._message_vars['max_len'] = self._max
        if self._min is None:
            self._min = self._min_urs_length
            self._message_vars['min_len'] = self._min

        length_ok = self._check_mm(len(util.to_unicode(value)))
        if not length_ok:
            message = self._messages[self._message_key + '_len']
            return False, message

        lowered = value.lower()
        return self.is_email(lowered)
Example #21
0
File: wx.py Project: qq40660/wx
 def post(self):
     """Handle an incoming WeChat POST: verify the signature, reply as XML."""
     global token
     params = dict(
         token=token,
         timestamp=self.request.get('timestamp'),
         nonce=self.request.get('nonce'),
         signature=self.request.get('signature')
     )
     if not checkSignure(**params):
         return webapp2.abort(403)
     reply = generate_reply(parse_message(self.request.body))
     self.response.content_type = 'application/xml'
     self.response.write(to_unicode(reply.to_xml()))
Example #22
0
 def process_body(self, body, task):
     """Persist a crawled page body and enqueue its unseen in-depth links.

     Bug fix: ``str.replace`` returns a new string; the original discarded
     the result, so the XML declaration was never removed.
     """
     url = task.get('url')
     body_size = len(body)
     body = to_unicode(body)
     body = body.replace('<?xml version="1.0" encoding="utf-8"?>', '')
     self.logger.info("page body, url:%s, body:%s" % (url, body[:100]))
     self.db_helper.save_mining_result(body, body_size, task)
     if task.get('depth') <= self.maxdepth:
         tree = lxml.html.document_fromstring(body)
         a_elements = tree.xpath('//a')
         urls = valid_a_href(a_elements, url)
         # only schedule URLs we have not crawled before
         not_exist = self.url_dedup.insert_not_exist(urls)
         self.db_helper.insert_mining_task(task, not_exist)
Example #23
0
def get_embedding(counter,
                  data_type,
                  limit=-1,
                  emb_file=None,
                  vec_size=None,
                  token2idx_dict=None):
    """Build an embedding matrix for tokens counted more than *limit* times.

    When *emb_file* is given, vectors are read from it (each line: token
    parts followed by *vec_size* floats); otherwise random N(0, 0.01)
    vectors are drawn.  Indices 0 and 1 are reserved for the NULL and OOV
    tokens, whose vectors are all zeros.

    Returns (emb_mat, token2idx_dict).
    """
    print("Generating {} embedding...{}".format(data_type, emb_file))
    embedding_dict = {}
    filtered_elements = [k for k, v in counter.items() if v > limit]
    if emb_file is not None:
        assert vec_size is not None
        with codecs.open(emb_file, "r", encoding="utf-8") as fh:
            for line in fh:
                fields = to_unicode(line).strip().split()
                # everything before the trailing floats is the token text
                word = "".join(fields[0:-vec_size])
                vector = list(map(float, fields[-vec_size:]))
                if word in counter and counter[word] > limit:
                    embedding_dict[word] = vector
        print("{} / {} tokens have corresponding {} embedding vector".format(
            len(embedding_dict), len(filtered_elements), data_type))
    else:
        assert vec_size is not None
        for token in filtered_elements:
            embedding_dict[token] = [
                np.random.normal(scale=0.01) for _ in range(vec_size)
            ]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))

    NULL = "--NULL--"
    OOV = "--OOV--"
    if token2idx_dict is None:
        # real tokens start at index 2; 0 and 1 are reserved
        token2idx_dict = {
            token: idx
            for idx, token in enumerate(embedding_dict.keys(), 2)
        }
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)]
    idx2emb_dict = {
        idx: embedding_dict[token]
        for token, idx in token2idx_dict.items()
    }
    emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))]
    return emb_mat, token2idx_dict
Example #24
0
    def get(self, name, default=''):
        """Return the value of the specified option.

        Valid default input is a string. Returns a string.
        """
        parser = self.config.parser
        if parser.has_option(self.name, name):
            value = parser.get(self.name, name)
        elif self.config.parent:
            # fall back to the inherited (parent) configuration
            value = self.config.parent[self.name].get(name, default)
        else:
            # last resort: the registered option's declared default
            option = Option.registry.get((self.name, name))
            value = (option.default or default) if option else default
        if not value:
            return u''
        elif isinstance(value, basestring):
            return to_unicode(value)
        else:
            return value
Example #25
0
    def __init__(self, trigger, func, args, kwargs, misfire_grace_time,
                 coalesce, name=None, max_runs=None, max_instances=1):
        """Validate all job parameters, then record them on the instance.

        Raises ValueError/TypeError for unusable parameters; checks run in
        parameter order, so the first offending argument wins.
        """
        if not trigger:
            raise ValueError('The trigger must not be None')
        if not hasattr(func, '__call__'):
            raise TypeError('func must be callable')
        if not hasattr(args, '__getitem__'):
            raise TypeError('args must be a list-like object')
        if not hasattr(kwargs, '__getitem__'):
            raise TypeError('kwargs must be a dict-like object')
        if misfire_grace_time <= 0:
            raise ValueError('misfire_grace_time must be a positive value')
        if max_runs is not None and max_runs <= 0:
            raise ValueError('max_runs must be a positive value')
        if max_instances <= 0:
            raise ValueError('max_instances must be a positive value')

        self._lock = Lock()

        self.trigger = trigger
        self.func = func
        self.args = args
        self.kwargs = kwargs
        # default the job name to the callable's own name
        self.name = to_unicode(name or get_callable_name(func))
        self.misfire_grace_time = misfire_grace_time
        self.coalesce = coalesce
        self.max_runs = max_runs
        self.max_instances = max_instances
        self.runs = 0
        self.instances = 0
Example #26
0
def get_by_city_code(city_code='101110101'):
    """Fetch the current weather for *city_code* (defaults to Beijing).

    Returns a dict with 'date', 'city', 'temp', 'weather' and 'tip' keys,
    or None when the response is not valid JSON.
    """
    remote_url = URL.format(city_code)
    raw = ul.urlopen(remote_url).read()
    payload = util.to_unicode(raw)

    try:
        weather_info = json.loads(payload)['weatherinfo']

        slot = get_current()
        result = {
            'date': weather_info['date_y'],
            'city': weather_info['city'],
            'temp': weather_info['temp%d' % slot],
            'weather': weather_info['weather%d' % slot],
            'tip': weather_info['index_d']
        }
    except ValueError:
        # service returned something that is not JSON
        result = None
    return result
Example #27
0
def get_by_city_code(city_code='101110101'):
    """Look up current weather by city code (default 101110101 = Beijing).

    Returns a date/city/temp/weather/tip dict, or None on a non-JSON reply.
    """
    response = util.to_unicode(ul.urlopen(URL.format(city_code)).read())

    try:
        data = json.loads(response)
        weather_info = data['weatherinfo']

        # pick the temp/weather slot for the current time of day
        cur = get_current()
        names = ('date', 'city', 'temp', 'weather', 'tip')
        keys = ('date_y', 'city', 'temp%d' % cur, 'weather%d' % cur, 'index_d')
        re = dict(zip(names, (weather_info[k] for k in keys)))
    except ValueError:
        re = None
    return re
Example #28
0
def get_by_city_name(city=u'北京'):
    """Resolve a city name (or a dict with a 'where' key) to weather info."""
    name = city['where'] if isinstance(city, dict) else city
    return get_by_city_code(city_dict[util.to_unicode(name)])
Example #29
0
 def _check(self, value):
     """Length-check *value*, then defer to the email validator."""
     if self._check_mm(len(util.to_unicode(value))):
         return self.is_email(value)
     return False, self._messages[self._message_key + '_len']
Example #30
0
def run(nogui=False):
    """Parses command-line arguments and either runs GUI, or a CLI action.

    With no recognised command (or a bare .db argument) the GUI is the
    default; CLI commands rewire stdout/stderr for safe console output.
    """
    global is_cli, is_gui_possible, is_verbose

    if (getattr(sys, 'frozen', False) # Binary application
    or sys.executable.lower().endswith("pythonw.exe")):
        sys.stdout = ConsoleWriter(sys.stdout) # Hooks for attaching to 
        sys.stderr = ConsoleWriter(sys.stderr) # a text console
    if "main" not in sys.modules: # E.g. setuptools install, calling main.run
        srcdir = os.path.abspath(os.path.dirname(__file__))
        if srcdir not in sys.path: sys.path.append(srcdir)
        sys.modules["main"] = __import__("main")

    # Build the argument parser from the declarative ARGUMENTS structure:
    # top-level options first, then one sub-parser per command.
    argparser = argparse.ArgumentParser(description=ARGUMENTS["description"])
    for arg in ARGUMENTS["arguments"]:
        argparser.add_argument(*arg.pop("args"), **arg)
    subparsers = argparser.add_subparsers(dest="command")
    for cmd in ARGUMENTS["commands"]:
        kwargs = dict((k, cmd[k]) for k in cmd if k in ["help", "description"])
        subparser = subparsers.add_parser(cmd["name"], **kwargs)
        for arg in cmd["arguments"]:
            kwargs = dict((k, arg[k]) for k in arg if k != "args")
            subparser.add_argument(*arg["args"], **kwargs)

    if "nt" == os.name: # Fix Unicode arguments, otherwise converted to ?
        sys.argv[:] = win32_unicode_argv()
    argv = sys.argv[1:]
    if not argv or (argv[0] not in subparsers.choices
    and argv[0].endswith(".db")):
        argv[:0] = ["gui"] # argparse hack: force default argument
    if argv[0] in ("-h", "--help") and len(argv) > 1:
        argv[:2] = argv[:2][::-1] # Swap "-h option" to "option -h"

    arguments = argparser.parse_args(argv)

    if hasattr(arguments, "FILE1") and hasattr(arguments, "FILE2"):
        # two-file-list commands: merge both lists into a single FILE list
        arguments.FILE1 = [util.to_unicode(f) for f in arguments.FILE1]
        arguments.FILE2 = [util.to_unicode(f) for f in arguments.FILE2]
        arguments.FILE = arguments.FILE1 + arguments.FILE2
    if arguments.FILE: # Expand wildcards to actual filenames
        arguments.FILE = sum([glob.glob(f) if "*" in f else [f]
                              for f in arguments.FILE], [])
        arguments.FILE = sorted(set(util.to_unicode(f) for f in arguments.FILE))

    if "gui" == arguments.command and (nogui or not is_gui_possible):
        # GUI requested but unavailable: print help and exit with a notice
        argparser.print_help()
        status = None
        if not nogui: status = ("\n\nwxPython not found. %s graphical program "
                                "will not run." % conf.Title)
        sys.exit(status)
    elif "gui" != arguments.command:
        # CLI mode: load config and mark module-level CLI flags
        conf.load()
        is_cli = sys.modules["main"].is_cli = True
        is_verbose = sys.modules["main"].is_verbose = arguments.verbose
        # Avoid Unicode errors when printing to console.
        enc = sys.stdout.encoding or locale.getpreferredencoding() or "utf-8"
        sys.stdout = codecs.getwriter(enc)(sys.stdout, "xmlcharrefreplace")
        sys.stderr = codecs.getwriter(enc)(sys.stderr, "xmlcharrefreplace")

    # Dispatch to the selected command's runner.
    if "diff" == arguments.command:
        run_diff(*arguments.FILE)
    elif "merge" == arguments.command:
        run_merge(arguments.FILE, arguments.output)
    elif "export" == arguments.command:
        run_export(arguments.FILE, arguments.type, arguments.chat,
                   arguments.author, arguments.ask_password)
    elif "search" == arguments.command:
        run_search(arguments.FILE, arguments.QUERY)
    elif "gui" == arguments.command:
        run_gui(arguments.FILE)
Example #31
0
    def __init__(self, article, abstract_sentences, all_abstract_sentences,
                 doc_indices, raw_article_sents, vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        Args:
            article: source text; a string. each token is separated by a single space.
            abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
            all_abstract_sentences: all reference summaries, stored verbatim on self.
            doc_indices: per-word document ids (multi-document setting), stored verbatim on self.
            raw_article_sents: optional list of raw article sentences; when None or empty, sentences are recovered by sentence-splitting *article*.
            vocab: Vocabulary object
            hps: hyperparameters
        """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # Process the article (truncate to max_enc_steps tokens)
        article_words = article.split()
        if len(article_words) > hps.max_enc_steps:
            article_words = article_words[:hps.max_enc_steps]
        self.enc_input = [
            vocab.word2id(w) for w in article_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:

            if raw_article_sents is not None and len(raw_article_sents) > 0:
                # Caller supplied the raw sentences: tokenize them directly
                self.tokenized_sents = [
                    process_sent(sent) for sent in raw_article_sents
                ]
                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                    self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(
                    self.word_ids_sents)
                self.enc_len = len(
                    self.enc_input_extend_vocab
                )  # store the length after truncation but before padding
            else:
                # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
                article_str = util.to_unicode(article)
                raw_article_sents = nltk.tokenize.sent_tokenize(article_str)
                self.tokenized_sents = [
                    process_sent(sent) for sent in raw_article_sents
                ]
                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                    self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(
                    self.word_ids_sents)
                # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
                self.enc_len = len(
                    self.enc_input_extend_vocab
                )  # store the length after truncation but before padding

            # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        # Store the original strings
        self.original_article = article
        self.raw_article_sents = raw_article_sents
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
        self.all_original_abstract_sents = all_abstract_sentences

        self.doc_indices = doc_indices  # doc_id in multidoc correspond to each word
Example #32
0
import sys  # bug fix: sys.argv is used below but sys was never imported here

import util
from os.path import join, abspath, dirname
#--------------------------- Get Version ------------------------------#
# Read the version number and date from the 'version' file; fall back to
# hard-coded defaults when the file is missing or too short.  The original
# code used a bare ``except`` (hiding real errors) and never closed the
# file handle.
try:
    with open('version', 'r') as f:
        lines = f.readlines()
    v = lines[0].strip()
    vd = lines[1].strip()
except (IOError, OSError, IndexError):
    v = '0.7'
    vd = '0101'
VERSION = v
VERSION_DATE = vd
#-----------------------------------------------------------------------#

# Resource paths are resolved relative to the executable's directory.
ROOT_PATH = util.to_unicode(abspath(dirname(sys.argv[0])))
XRC_PATH = join(ROOT_PATH, 'resource', 'xrc')
LANG_PATH = join(ROOT_PATH, 'lang')
PUZZLE_PATH = join(ROOT_PATH, 'puzzle')

# NOTE(review): eval() on a config value is risky -- int() would suffice
# if CellSize is always a plain number; kept as-is to preserve behavior.
nCellSize = eval(util.config.get('APP', 'CellSize', '50'))
nAnswerCellSize = nCellSize*0.6
nLINE = 9
nGRID = 3
rgLINE = range(nLINE)
rgGRID = range(nGRID)

clBgFocus = '#C1DEA3'
clBgOver  = '#8FD6FF'
clBgNormal = '#EEEEEE'
clBgDefault = '#E3EDFF'
Example #33
0
    def __init__(self, article, abstract_sentences, all_abstract_sentences,
                 doc_indices, raw_article_sents, ssi, article_lcs_paths_list,
                 vocab, hps):
        """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        Args:
            article: source text; a string. each token is separated by a single space.
            abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
            all_abstract_sentences: list of sentence-lists for all reference summaries; a length of 1 marks the single-document case below.
            doc_indices: per-word document ids for multi-document input (may be rebuilt below in the single-document case).
            raw_article_sents: pre-split article sentences, or None/empty to sentence-tokenize `article` here with NLTK.
            ssi: per-summary-sentence lists of similar source-sentence indices, or None.
            article_lcs_paths_list: per-sentence lists of important word indices, or None.
            vocab: Vocabulary object
            hps: hyperparameters
        """
        self.hps = hps

        # Get ids of special tokens
        start_decoding = vocab.word2id(data.START_DECODING)
        stop_decoding = vocab.word2id(data.STOP_DECODING)

        # # Process the article
        # article_words = article.split()
        # if len(article_words) > hps.max_enc_steps:
        #     article_words = article_words[:hps.max_enc_steps]
        # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

        # Process the abstract
        abstract = ' '.join(abstract_sentences)  # string
        abstract_words = abstract.split()  # list of strings
        abs_ids = [
            vocab.word2id(w) for w in abstract_words
        ]  # list of word ids; OOVs are represented by the id for UNK token

        # Get the decoder input sequence and target sequence
        self.dec_input, self.target = self.get_dec_inp_targ_seqs(
            abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
        self.dec_len = len(self.dec_input)

        # If using pointer-generator mode, we need to store some extra info
        if hps.pointer_gen:

            # Case 1: caller supplied pre-split article sentences.
            if raw_article_sents is not None and len(raw_article_sents) > 0:
                # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents]
                self.tokenized_sents = [
                    util.process_sent(sent, whitespace=True)
                    for sent in raw_article_sents
                ]
                # Optionally append a separator token after every sentence
                # except the last.
                if self.hps.sep:
                    for sent in self.tokenized_sents[:-1]:
                        sent.append(data.SEP_TOKEN)

                # Process the article
                article_words = util.flatten_list_of_lists(
                    self.tokenized_sents)
                if len(article_words) > hps.max_enc_steps:
                    article_words = article_words[:hps.max_enc_steps]
                self.enc_input = [
                    vocab.word2id(w) for w in article_words
                ]  # list of word ids; OOVs are represented by the id for UNK token

                # Single-document case: every word belongs to document 0.
                if len(all_abstract_sentences) == 1:
                    doc_indices = [0] * len(article_words)

                # Extended-vocab ids: in-article OOVs get temporary ids;
                # the OOV words themselves are kept in self.article_oovs.
                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                    self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(
                    self.word_ids_sents)
                # Truncate the extended-vocab input to the encoder step limit.
                if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                    self.enc_input_extend_vocab = self.enc_input_extend_vocab[:
                                                                              hps
                                                                              .
                                                                              max_enc_steps]
                self.enc_len = len(
                    self.enc_input_extend_vocab
                )  # store the length after truncation but before padding
            else:
                # Case 2: no pre-split sentences -- sentence-tokenize the raw
                # article string ourselves.
                # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
                article_str = util.to_unicode(article)
                raw_article_sents = nltk.tokenize.sent_tokenize(article_str)
                self.tokenized_sents = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]

                # Process the article
                article_words = util.flatten_list_of_lists(
                    self.tokenized_sents)
                if len(article_words) > hps.max_enc_steps:
                    article_words = article_words[:hps.max_enc_steps]
                self.enc_input = [
                    vocab.word2id(w) for w in article_words
                ]  # list of word ids; OOVs are represented by the id for UNK token

                # Single-document case: every word belongs to document 0.
                if len(all_abstract_sentences) == 1:
                    doc_indices = [0] * len(article_words)

                self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                    self.tokenized_sents, vocab)
                self.enc_input_extend_vocab = util.flatten_list_of_lists(
                    self.word_ids_sents)
                # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
                # Truncate the extended-vocab input to the encoder step limit.
                if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                    self.enc_input_extend_vocab = self.enc_input_extend_vocab[:
                                                                              hps
                                                                              .
                                                                              max_enc_steps]
                self.enc_len = len(
                    self.enc_input_extend_vocab
                )  # store the length after truncation but before padding

            # Optional word-importance regularization targets.
            if self.hps.word_imp_reg:
                self.enc_importances = self.get_enc_importances(
                    self.tokenized_sents, abstract_words)

            # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id
            abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                     self.article_oovs)

            # Overwrite decoder target sequence so it uses the temp article OOV ids
            _, self.target = self.get_dec_inp_targ_seqs(
                abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
                stop_decoding)

        if ssi is not None:
            # Translate the similar source indices into masks over the encoder input
            self.ssi_masks = []
            for source_indices in ssi:
                ssi_sent_mask = [0.] * len(raw_article_sents)
                for source_idx in source_indices:
                    # NOTE(review): this branch is a no-op (looks like a
                    # leftover breakpoint anchor); an out-of-range index
                    # still raises IndexError on the assignment below.
                    if source_idx >= len(ssi_sent_mask):
                        a = 0
                    ssi_sent_mask[source_idx] = 1.
                # Expand the sentence-level mask to one value per word.
                ssi_mask = pg_mmr_functions.convert_to_word_level(
                    ssi_sent_mask, self.tokenized_sents)
                self.ssi_masks.append(ssi_mask)

            summary_sent_tokens = [
                sent.strip().split() for sent in abstract_sentences
            ]
            # Without an external ssi data path there must be exactly one
            # mask per summary sentence.
            if self.hps.ssi_data_path is None and len(
                    self.ssi_masks) != len(summary_sent_tokens):
                raise Exception(
                    'len(self.ssi_masks) != len(summary_sent_tokens)')

            # Word-level summary-sentence index for each summary token.
            self.sent_indices = pg_mmr_functions.convert_to_word_level(
                list(range(len(summary_sent_tokens))),
                summary_sent_tokens).tolist()

        if article_lcs_paths_list is not None:
            if len(article_lcs_paths_list) > 1:
                raise Exception('Need to implement for non-sent_dataset')
            article_lcs_paths = article_lcs_paths_list[0]
            # Build a 0/1 importance mask over the (possibly truncated)
            # article words; to_add converts sentence-local word indices
            # into flat positions in the concatenated article.
            imp_mask = [0] * len(article_words)
            to_add = 0
            for source_idx, word_indices_list in enumerate(article_lcs_paths):
                if source_idx > 0:
                    to_add += len(self.tokenized_sents[source_idx - 1])
                for word_idx in word_indices_list:
                    if word_idx + to_add >= len(imp_mask):
                        # Indices past the truncation limit are expected and
                        # skipped; anything else is a real inconsistency.
                        if len(imp_mask) == hps.max_enc_steps:
                            continue
                        else:
                            print(self.tokenized_sents, article_lcs_paths)
                            raise Exception(
                                'word_idx + to_add (%d) is larger than imp_mask size (%d)'
                                % (word_idx + to_add, len(imp_mask)))
                    imp_mask[word_idx + to_add] = 1
            self.importance_mask = imp_mask

        # Store the original strings
        self.original_article = article
        self.raw_article_sents = raw_article_sents
        self.original_abstract = abstract
        self.original_abstract_sents = abstract_sentences
        self.all_original_abstract_sents = all_abstract_sentences

        self.doc_indices = doc_indices  # doc id per word in multi-doc input
        self.ssi = ssi
        self.article_lcs_paths_list = article_lcs_paths_list
 def print_body(body):
     body = to_unicode(body)
     print body[:500]
def process_body(body, url):
    """Decode *body* and parse it into an lxml HTML document tree.

    Args:
        body: raw response body, decoded to unicode via to_unicode.
        url: source URL of the body. NOTE(review): unused in the visible
            code -- presumably consumed by logic further down.
    """
    # Removed leftover debugger breakpoint (import pdb; pdb.set_trace())
    # that halted execution on every call.
    body = to_unicode(body)
    doc = lxml.html.fromstring(body)
Example #36
0
 def _check(self, value):
     """Validate *value*: its unicode length must pass the min/max check,
     after which it is validated as an e-mail address.

     Returns either is_email()'s result or (False, length-error message).
     """
     length = len(util.to_unicode(value))
     if self._check_mm(length):
         return self.is_email(value)
     return False, self._messages[self._message_key + '_len']
Example #37
0
def get_by_city_name(city=u'北京'):
    """Resolve *city* (a name, or a dict carrying it under 'where') to its
    code via city_dict and delegate to get_by_city_code."""
    name = city['where'] if isinstance(city, dict) else city
    name = util.to_unicode(name)
    return get_by_city_code(city_dict[name])
Example #38
0
 def __init__(self, value=False, reason=None):
     """Store a truthiness flag and an optional unicode reason string
     (empty string when *reason* is falsy)."""
     self.__bool = bool(value)
     if reason:
         self.__reason = to_unicode(reason)
     else:
         self.__reason = ""
Example #39
0
def run(argv):
    """Parses command-line arguments and either runs GUI, or a CLI action.

    Args:
        argv: list of command-line arguments (excluding the program name).
    """
    global is_cli, is_gui_possible, is_verbose

    # Frozen executables and pythonw.exe have no usable console; wrap
    # stdout/stderr so output can still be routed to a text console.
    if (getattr(sys, 'frozen', False) # Binary application
    or sys.executable.lower().endswith("pythonw.exe")):
        sys.stdout = ConsoleWriter(sys.stdout) # Hooks for attaching to
        sys.stderr = ConsoleWriter(sys.stderr) # a text console

    # Build the parser and one subparser per command from the declarative
    # ARGUMENTS structure; each dict's remaining keys feed add_argument.
    argparser = argparse.ArgumentParser(description=ARGUMENTS["description"])
    for arg in ARGUMENTS["arguments"]:
        names = arg["args"]; del arg["args"]
        argparser.add_argument(*names, **arg)
    subparsers = argparser.add_subparsers(dest="command")
    for cmd in ARGUMENTS["commands"]:
        kwargs = dict((k, cmd[k]) for k in cmd if k in ["help", "description"])
        subparser = subparsers.add_parser(cmd["name"], **kwargs)
        for arg in cmd["arguments"]:
            kwargs = dict((k, arg[k]) for k in arg if k != "args")
            subparser.add_argument(*arg["args"], **kwargs)

    # A bare .db filename (or no arguments at all) defaults to "gui".
    if not argv or (argv[0] not in subparsers.choices 
    and argv[0].endswith(".db")):
        argv[:0] = ["gui"] # argparse hack: force default argument
    if argv[0] in ("-h", "--help") and len(argv) > 1:
        argv[:2] = argv[:2][::-1] # Swap "-h option" to "option -h"

    arguments = argparser.parse_args(argv)

    # Commands taking two file lists: decode both and merge into FILE.
    if hasattr(arguments, "FILE1") and hasattr(arguments, "FILE2"):
        arguments.FILE1 = [util.to_unicode(f) for f in arguments.FILE1]
        arguments.FILE2 = [util.to_unicode(f) for f in arguments.FILE2]
        arguments.FILE = arguments.FILE1 + arguments.FILE2
    if arguments.FILE: # Expand wildcards to actual filenames
        arguments.FILE = sum([(sorted(glob.glob(f)) if "*" in f else [f])
                             for f in arguments.FILE], [])
        arguments.FILE = [util.to_unicode(f) for f in arguments.FILE]

    if "gui" == arguments.command and not is_gui_possible:
        argparser.print_help()
        print("\n\nwxPython not found. %s graphical program will not run." %
              conf.Title)
        sys.exit()
    elif "gui" != arguments.command:
        # CLI mode: flag it both here and on the "main" module.
        is_cli = sys.modules["main"].is_cli = True
        is_verbose = sys.modules["main"].is_verbose = arguments.verbose
        enc = sys.stdout.encoding or locale.getpreferredencoding() or "utf-8"
        if "nt" == os.name: # Avoid print encoding errors under windows
            sys.stdout = codecs.getwriter(enc)(sys.stdout, "xmlcharrefreplace")
            sys.stderr = codecs.getwriter(enc)(sys.stderr, "xmlcharrefreplace")

    # Dispatch to the selected command handler.
    if "diff" == arguments.command:
        run_diff(*arguments.FILE)
    elif "merge" == arguments.command:
        run_merge(arguments.FILE)
    elif "export" == arguments.command:
        run_export(arguments.FILE, arguments.type)
    elif "search" == arguments.command:
        run_search(arguments.FILE, arguments.QUERY)
    elif "gui" == arguments.command:
        run_gui(arguments.FILE)
Example #40
0
def run(nogui=False):
    """Parses command-line arguments and either runs GUI, or a CLI action.

    Args:
        nogui: if True, never launch the GUI even when wxPython is available.
    """
    global is_cli, is_gui_possible, is_verbose

    # Frozen executables and pythonw.exe have no usable console; wrap
    # stdout/stderr so output can still be routed to a text console.
    if (getattr(sys, 'frozen', False)  # Binary application
            or sys.executable.lower().endswith("pythonw.exe")):
        sys.stdout = ConsoleWriter(sys.stdout)  # Hooks for attaching to
        sys.stderr = ConsoleWriter(sys.stderr)  # a text console
    # When invoked via an entry point rather than as a script, make sure a
    # module named "main" is importable and registered in sys.modules.
    if "main" not in sys.modules:  # E.g. setuptools install, calling main.run
        srcdir = os.path.abspath(os.path.dirname(__file__))
        if srcdir not in sys.path: sys.path.append(srcdir)
        sys.modules["main"] = __import__("main")

    # Build the parser and one subparser per command from the declarative
    # ARGUMENTS structure; each dict's remaining keys feed add_argument.
    argparser = argparse.ArgumentParser(description=ARGUMENTS["description"])
    for arg in ARGUMENTS["arguments"]:
        argparser.add_argument(*arg.pop("args"), **arg)
    subparsers = argparser.add_subparsers(dest="command")
    for cmd in ARGUMENTS["commands"]:
        kwargs = dict((k, cmd[k]) for k in cmd if k in ["help", "description"])
        subparser = subparsers.add_parser(cmd["name"], **kwargs)
        for arg in cmd["arguments"]:
            kwargs = dict((k, arg[k]) for k in arg if k != "args")
            subparser.add_argument(*arg["args"], **kwargs)

    if "nt" == os.name:  # Fix Unicode arguments, otherwise converted to ?
        sys.argv[:] = win32_unicode_argv()
    argv = sys.argv[1:]
    # A bare .db filename (or no arguments at all) defaults to "gui".
    if not argv or (argv[0] not in subparsers.choices
                    and argv[0].endswith(".db")):
        argv[:0] = ["gui"]  # argparse hack: force default argument
    if argv[0] in ("-h", "--help") and len(argv) > 1:
        argv[:2] = argv[:2][::-1]  # Swap "-h option" to "option -h"

    arguments = argparser.parse_args(argv)

    # Commands taking two file lists: decode both and merge into FILE.
    if hasattr(arguments, "FILE1") and hasattr(arguments, "FILE2"):
        arguments.FILE1 = [util.to_unicode(f) for f in arguments.FILE1]
        arguments.FILE2 = [util.to_unicode(f) for f in arguments.FILE2]
        arguments.FILE = arguments.FILE1 + arguments.FILE2
    if arguments.FILE:  # Expand wildcards to actual filenames
        arguments.FILE = sum(
            [glob.glob(f) if "*" in f else [f] for f in arguments.FILE], [])
        arguments.FILE = sorted(set(
            util.to_unicode(f) for f in arguments.FILE))

    # GUI requested but unavailable (or suppressed): show help and exit,
    # with an explanatory status only when suppression wasn't requested.
    if "gui" == arguments.command and (nogui or not is_gui_possible):
        argparser.print_help()
        status = None
        if not nogui:
            status = ("\n\nwxPython not found. %s graphical program "
                      "will not run." % conf.Title)
        sys.exit(status)
    elif "gui" != arguments.command:
        # CLI mode: load config and flag CLI state on the "main" module too.
        conf.load()
        is_cli = sys.modules["main"].is_cli = True
        is_verbose = sys.modules["main"].is_verbose = arguments.verbose
        # Avoid Unicode errors when printing to console.
        enc = sys.stdout.encoding or locale.getpreferredencoding() or "utf-8"
        sys.stdout = codecs.getwriter(enc)(sys.stdout, "xmlcharrefreplace")
        sys.stderr = codecs.getwriter(enc)(sys.stderr, "xmlcharrefreplace")

    # Dispatch to the selected command handler.
    if "diff" == arguments.command:
        run_diff(*arguments.FILE)
    elif "merge" == arguments.command:
        run_merge(arguments.FILE, arguments.output)
    elif "export" == arguments.command:
        run_export(arguments.FILE, arguments.type, arguments.chat,
                   arguments.author, arguments.ask_password)
    elif "search" == arguments.command:
        run_search(arguments.FILE, arguments.QUERY)
    elif "gui" == arguments.command:
        run_gui(arguments.FILE)