Example #1
    def __getitem__(self, idx):
        label, guid, text_a, text_b = self.df.iloc[idx, :]
        tokens_1 = self.tokenizer.tokenize(convert_to_unicode(text_a))
        tokens_2 = self.tokenizer.tokenize(convert_to_unicode(text_b))
        _truncate_seq_pair(tokens_1, tokens_2, self.max_seq_len - 3)

        # assemble [CLS]+text_a+[SEP]
        word_pieces = ["[CLS]"]
        word_pieces += tokens_1 + ["[SEP]"]
        len_1 = len(tokens_1) + 2

        # assemble [CLS]+text_a+[SEP]+text_b+[SEP]
        word_pieces += tokens_2 + ["[SEP]"]
        len_2 = len(tokens_2) + 1

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.LongTensor(ids)
        segments_tensor = torch.LongTensor([0] * len_1 + [1] * len_2)

        # labels
        label_id = self.label_map[label]
        label_tensor = torch.tensor(label_id)

        return (tokens_tensor, segments_tensor, label_tensor)
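
For reference, `_truncate_seq_pair` is the helper from the original BERT codebase: it trims the longer list in place until the pair fits the budget (here `max_seq_len - 3`, leaving room for `[CLS]` and two `[SEP]`):

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to at most max_length total tokens."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        # pop from the longer sequence so both keep as much content as possible
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()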
Example #2
    def _create_examples(self, lines, base=False):
        examples = []

        for data in lines:
            # data fields: id, src, tgt, dep, answers, numbers, ..., numdep
            guid = data[0].replace('\ufeff', '')
            src = convert_to_unicode(data[1].replace('N', 'N ').replace('C', 'C '))
            tgt = convert_to_unicode(data[2]).replace('N', 'N ').replace('C', 'C ')

            # answers may be missing or non-numeric for some rows
            try:
                self.answers[int(guid)] = \
                    list(set([float(answer) for answer in data[4].split(' ')]))
            except (IndexError, ValueError):
                self.answers[int(guid)] = ''
            # map positional placeholders n0, n1, ... to the problem's numbers
            temnum = {}
            for dn, number in enumerate(data[5].split(' ')):
                temnum['n{}'.format(dn)] = number
            self.numbers[int(guid)] = temnum
            self.false_equations[int(guid)] = []

            if not base:
                dep = convert_to_unicode(data[3])
                numdep = convert_to_unicode(data[-1])
                examples.append(InputExample(guid=guid, src=src, tgt=tgt,
                                             dep=[int(f) for f in dep.split(' ') if f != ''] + [-10],
                                             numdep=[int(f) for f in numdep.split(' ') if f != '']))
            else:
                examples.append(InputExample(guid=guid, src=src, tgt=tgt))
        return examples
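
A worked trace of the placeholder mapping, with a hypothetical value for `data[5]`:

# data[5] = "3 12 0.5"                          (hypothetical)
# temnum  = {'n0': '3', 'n1': '12', 'n2': '0.5'}
# self.numbers stores this dict keyed by the integer guid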
Example #3
 def _create_examples(self, lines):
     examples = [] 
     for data in lines:
         # lines: id, summary, text
         guid = data[0]
         src = convert_to_unicode(data[2])
         tgt = convert_to_unicode(data[1])
         examples.append(InputExample(guid=guid, src=src, tgt=tgt))
     return examples
Example #4
    def take_action(self, parsed_args):
        try:
            hostname = None
            if parsed_args.hostname:
                hostname = parsed_args.hostname.strip()
                hostname = utils.convert_to_unicode(hostname)

            inventory = Inventory.load()

            if hostname:
                host = inventory.get_host(hostname)
                if not host:
                    _host_not_found(self.log, hostname)

            data = []
            host_groups = inventory.get_host_groups()
            if host_groups:
                if hostname:
                    data.append((hostname, host_groups[hostname]))
                else:
                    for (hostname, groupnames) in host_groups.items():
                        data.append((hostname, groupnames))
            else:
                data.append(('', ''))
            return (('Host', 'Groups'), sorted(data))
        except CommandError as e:
            raise e
        except Exception as e:
            raise Exception(traceback.format_exc())
Example #5
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text = utils.convert_to_unicode(line.strip())
         labels = self._labels_words(text)
         text = re.sub(r'\s+', '', text.strip())
         examples.append(InputExample(guid=guid, text=text, labels=labels))
     return examples
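
Traced from the code above with a hypothetical input line:

# line   = "hello 世界"                (hypothetical)
# guid   = "dev-0"                     (set_type "dev", index 0)
# labels = self._labels_words(text)    computed from the spaced text
# text   = "hello世界"                 (whitespace removed before storing)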
Example #6
 def take_action(self, parsed_args):
     try:
         hostname = parsed_args.hostname.strip()
         hostname = utils.convert_to_unicode(hostname)
         inventory = Inventory.load()
         inventory.remove_host(hostname)
         Inventory.save(inventory)
     except CommandError as e:
         raise e
     except Exception as e:
         raise Exception(traceback.format_exc())
Example #7
    def take_action(self, parsed_args):
        try:
            if not parsed_args.hostname and not parsed_args.file:
                raise CommandError('Hostname or hosts info file path ' +
                                   'is required')
            if parsed_args.hostname and parsed_args.file:
                raise CommandError('Hostname and hosts info file path ' +
                                   'cannot both be present')
            inventory = Inventory.load()

            if parsed_args.file:
                # multi-host setup via a YAML hosts file
                hosts_data = self.get_yml_data(parsed_args.file.strip())
                inventory.setup_hosts(hosts_data)
            else:
                # single host setup
                hostname = parsed_args.hostname.strip()
                hostname = utils.convert_to_unicode(hostname)
                if not inventory.get_host(hostname):
                    _host_not_found(self.log, hostname)

                check_ok = inventory.check_host(hostname, True)
                if check_ok:
                    self.log.info(
                        'Skipping setup of host (%s) as check is ok' %
                        hostname)
                    return True

                if parsed_args.insecure:
                    password = parsed_args.insecure.strip()
                else:
                    setup_user = get_setup_user()
                    password = getpass.getpass('%s password for %s: ' %
                                               (setup_user, hostname))
                password = utils.convert_to_unicode(password)
                inventory.setup_host(hostname, password)

        except CommandError as e:
            raise e
        except Exception as e:
            raise Exception(traceback.format_exc())
Example #8
    def __getitem__(self, idx):
        sent1, sent2, score = self.df.iloc[idx, 7:10]
        score_tensor = torch.tensor(score, dtype=torch.float32)

        tokens_1 = self.tokenizer.tokenize(convert_to_unicode(sent1))
        tokens_2 = self.tokenizer.tokenize(convert_to_unicode(sent2))
        _truncate_seq_pair(tokens_1, tokens_2, self.max_seq_len - 3)

        # assemble [CLS]+sent1+[SEP]
        word_pieces = ["[CLS]"]
        word_pieces += tokens_1 + ["[SEP]"]
        len_1 = len(tokens_1) + 2

        # assemble [CLS]+sent1+[SEP]+sent2+[SEP]
        word_pieces += tokens_2 + ["[SEP]"]
        len_2 = len(tokens_2) + 1

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.LongTensor(ids)
        segments_tensor = torch.LongTensor([0] * len_1 + [1] * len_2)

        return (tokens_tensor, segments_tensor, score_tensor)
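
Items come back with per-example lengths, so a DataLoader needs a padding collate function. A minimal sketch; the name `collate_batch` and the assumption that token id 0 is `[PAD]` are not from the original:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_batch(samples):
    tokens, segments, scores = zip(*samples)
    tokens = pad_sequence(tokens, batch_first=True)      # right-pads with 0 (assumed [PAD] id)
    segments = pad_sequence(segments, batch_first=True)
    masks = (tokens != 0).long()                         # 1 for real tokens, 0 for padding
    return tokens, segments, masks, torch.stack(scores)

# usage: DataLoader(dataset, batch_size=32, collate_fn=collate_batch)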
Example #9
    def take_action(self, parsed_args):
        try:
            hostname = parsed_args.hostname.strip()
            hostname = utils.convert_to_unicode(hostname)
            inventory = Inventory.load()
            if not inventory.get_host(hostname):
                _host_not_found(self.log, hostname)

            inventory.check_host(hostname)
        except CommandError as e:
            raise e
        except Exception as e:
            raise Exception(traceback.format_exc())
Example #10
def trans_type(file, is_train):

    if is_train == 'test':
        sentences = []
        with open(file) as reader:
            data = json.load(reader)
            for entry in data:
                sentences.append(entry['text'])
        with open(data_dir + is_train + '_sentences.txt', 'w') as writer:
            for sentence in sentences:
                writer.write(sentence + '\n')
    else:
        with open(file) as reader:
            data = json.load(reader)
            sentences = []
            intents = []
            domains = []
            role_labels = []
            for entry in data:
                text = utils.convert_to_unicode(entry['text'])
                domain = entry['domain']
                intent = entry['intent']
                slots = entry['slots']
                sentences.append(text)
                domains.append(domain)
                intents.append(intent)
                sentence_roled = sentence_role(text, slots)  # label sentence using slots
                role_labels.append(sentence_roled)

        with open(data_dir + is_train + '_sentences.txt', 'w') as writer:
            for sentence in sentences:
                writer.write(sentence + '\n')
        with open(data_dir + is_train + '_intents.txt', 'w') as writer:
            for intent in intents:
                writer.write(str(intent) + '\n')
        with open(data_dir + is_train + '_domains.txt', 'w') as writer:
            for domain in domains:
                writer.write(domain + '\n')
        with open(data_dir + is_train + '_role_labels.txt', 'w') as writer:
            for role_label in role_labels:
                writer.write(role_label + '\n')
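
For a non-test split the function writes four line-aligned files under data_dir. A hypothetical trace; the entry values are invented, the field and file names come from the code above:

# entry = {"text": "turn on the light", "domain": "home",
#          "intent": "switch_on", "slots": {...}}
# trans_type(path, 'train') produces:
#   train_sentences.txt  train_intents.txt  train_domains.txt  train_role_labels.txt
# where line i of each file describes the same utterance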
Example #11
    def _create_examples(lines, set_type=None):
        """Creates examples for the training and dev sets."""
        re_ENUM = re.compile(r'(([-–+])?\d+(([.·])\d+)?%?|([0-9_.·]*[A-Za-z]+[0-9_.·]*)+)')
        converter = opencc.OpenCC('t2s')

        def _labels_words(p_text_segment):
            inside_tokens = []
            inside_labels = []
            for segment in p_text_segment:
                hyper_tokens = segment.split()
                segment_tokens = []
                for hyper_token in hyper_tokens:
                    hyper_token = hyper_token.strip()
                    if len(hyper_token) > 0:
                        is_chinese = False
                        for c in hyper_token:
                            if process.process_utils.is_cjk_char(ord(c)):
                                is_chinese = True
                                break
                        if is_chinese:
                            segment_tokens.extend(list(hyper_token))
                        else:
                            segment_tokens.append(hyper_token)

                inside_tokens.extend(segment_tokens)
                if len(segment_tokens) == 1:
                    inside_labels.extend(["A"])
                elif len(segment_tokens) > 1:
                    inside_labels.extend(["BS"] + ["A"] * (len(segment_tokens) - 2) + ["ES"])

            return inside_tokens, inside_labels

        for (i, line) in enumerate(lines):
            line = convert_to_unicode(line.strip())
            text = process.process_utils.strQ2B(line).lower()
            text = converter.convert(text)
            text = re_ENUM.sub(" \\1 ", text)
            text_segment = text.split("☃")
            tokens, labels = _labels_words(text_segment)
            o_text = re.sub(r"\s|☃", "", line)
            offset = 0
            o_tokens = []
            for token in tokens:
                o_tokens.append(o_text[offset: offset + len(token)])
                offset += len(token)
            yield InputExample(guid=o_tokens, text=tokens, labels=labels)
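
Traced by hand from `_labels_words` above: single-token segments are tagged `A`, multi-token segments get `BS`/`ES` boundary tags:

# segment "深度学习" (CJK, one token per character):
#   tokens -> ['深', '度', '学', '习']
#   labels -> ['BS', 'A', 'A', 'ES']
# segment "ok" (a single non-CJK token):
#   tokens -> ['ok']
#   labels -> ['A']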
Example #12
 def load_dict(dictionary_files):
     """Loads a vocabulary file into a dictionary."""
     dictionary = collections.OrderedDict()
     dictionary_files = dictionary_files.split(",")
     for dictionary_file in dictionary_files:
         if dictionary_file.strip():  # skip empty entries, e.g. from a trailing comma
             with tf.gfile.GFile(dictionary_file, "r") as reader:
                 while True:
                     token = utils.convert_to_unicode(reader.readline())
                     if not token:
                         break
                     token = token.strip().split(" ")
                     if len(token) == 2:
                         dictionary[token[0]] = token[1]
                     else:
                         dictionary[token[0]] = 1
     return dictionary
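
A hypothetical call (file names invented); each dictionary line is either `token value` or a bare `token`, which defaults to 1:

# dict_a.txt contains "hello 3"; dict_b.txt contains "world"
d = load_dict("dict_a.txt,dict_b.txt")
# OrderedDict([('hello', '3'), ('world', 1)])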
Example #13
 def _create_examples(lines, set_type):
     """Creates examples for the training and dev sets."""
     tf.logging.info(f"creating {set_type} examples")
     text = []
     labels = []
     for line in lines:
         o_line = line
         line = utils.convert_to_unicode(strQ2B(line.strip()))
         char_info = line.split()
         if len(char_info) != 3:
             if len(char_info) != 0:
                 raise ValueError(o_line)
             # a blank line ends the current sentence
             if len(text) != 0:
                 yield InputExample(guid="", text=text, labels=labels)
             text = []
             labels = []
         else:
             text.append(char_info[0].strip())
             labels.append(char_info[2].strip())
     # flush the final sentence if the file does not end with a blank line
     if text:
         yield InputExample(guid="", text=text, labels=labels)
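
The expected input is CoNLL-style: three whitespace-separated columns (the character, a middle column this code ignores, and the label), with a blank line closing each sentence. A hypothetical fragment:

# 我 x B-PER
# 爱 x O
#
# (the blank line yields InputExample(text=['我', '爱'], labels=['B-PER', 'O']))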
Example #14
def load_vocab(vocab_files, preserve_token=None):
    """Loads one or more vocabulary files into a dictionary."""
    if preserve_token is None:
        preserve_token = []
    vocab = collections.OrderedDict()
    index = 0
    # reserved tokens get the lowest ids, in the order given
    for token in preserve_token:
        vocab[token] = index
        index += 1
    vocab_files = vocab_files.split(",")
    for vocab_file in vocab_files:
        with tf.gfile.GFile(vocab_file, "r") as reader:
            while True:
                token = utils.convert_to_unicode(reader.readline())
                if not token:
                    break
                token = token.strip()
                if token not in vocab:
                    vocab[token] = index
                    index += 1
    return vocab
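
A hypothetical call (file names invented): reserved tokens take the lowest ids, then the comma-separated files are merged in order, skipping duplicates:

vocab = load_vocab("vocab_a.txt,vocab_b.txt", preserve_token=["[PAD]", "[UNK]"])
# vocab["[PAD]"] == 0, vocab["[UNK]"] == 1; file tokens start at id 2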
Example #15
    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = utils.convert_to_unicode(text)
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia).
        text = self._tokenize_chinese_chars(text)

        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
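
For reference, `whitespace_tokenize` is the companion helper from the same BERT codebase:

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens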