Example #1
    def _build_subtoken_vocab(all_data):
        print('build subtoken vocab')

        def data_generator():
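            # Yield every sentence from each task's train/test splits, then the
            # unlabelled data from fudan.load_unlabeled_data(), so the subword
            # vocabulary is built over all of it.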
            for task_data in all_data:
                train_data, test_data = task_data
                for d in train_data + test_data:
                    yield ' '.join(d.sentence)

            for d in fudan.load_unlabeled_data():
                yield d

        def summary(vocab):
            lens = [len(vocab.encode(sentence)) for sentence in data_generator()]
            length = sorted(lens)
            length = np.asarray(length)

            max_len = np.max(length)
            avg_len = np.mean(length)
            med_len = np.median(length)
            print('max_len: {}, avg_len: {}, med_len: {}'.format(max_len, avg_len, med_len))

        encoder = SubwordTextEncoder()
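        # Target vocab size is FLAGS.vocab_size * 1024; the 200 passed below is
        # presumably the maximum subtoken length for build_from_generator.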
        vocab_size = 2 ** 10 * FLAGS.vocab_size
        vocab = encoder.build_from_generator(data_generator(), vocab_size, 200, reserved_tokens=RESERVED_TOKENS)

        vocab_file = get_vocab_file()
        base = os.path.dirname(vocab_file)
        tf.gfile.MakeDirs(base)
        vocab.store_to_file(vocab_file)

        summary(vocab)
        return vocab
Example #2
def main(args):
    subword_encoder = SubwordTextEncoder(args.vocab_file)

    record_iterator = tf.python_io.tf_record_iterator(path=args.path)
    record_basename = os.path.basename(args.path)

    with tf.python_io.TFRecordWriter(os.path.join(args.outdir,
                                                  record_basename)) as writer:
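        # For each serialized example: recover the raw subword ids, decode them
        # back to text so per-example weights can be computed, then re-serialize
        # the original ids together with the new weights.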
        for string_record in record_iterator:
            example = tf.train.Example()
            example.ParseFromString(string_record)

            inputs = dict(example.features.feature)['inputs']

            inputs = inputs.int64_list.value
            inputs_as_int64list = inputs
            inputs = subword_encoder.decode(inputs)

            targets = dict(example.features.feature)['targets']
            targets = targets.int64_list.value
            targets_as_int64_list = targets
            targets = subword_encoder.decode(targets)

            weights = get_weights(inputs, targets, args.weight)
            example_proto = serialize_example(inputs_as_int64list,
                                              targets_as_int64_list, weights)
            writer.write(example_proto)
Example #3
def gen(path_zh, path_ru):
    random.seed(hash(path_ru) % 1000)
    tokenizer = SubwordTextEncoder(
        '../data/vocab.translate_zhru_full.47000.subwords')
    with open(path_zh, 'r') as fzh, open(path_ru, 'r') as fru:
        sample = {"inputs": [], "targets": []}
        for line_zh, line_ru in zip(fzh, fru):
            ids_zh = tokenizer.encode(line_zh.rstrip() + ' ')
            ids_ru = tokenizer.encode(line_ru.rstrip() + ' ')
            if 0 == len(sample["inputs"]) or (
                    len(sample["inputs"]) + 1 + len(ids_zh) <= MAX_LEN
                    and len(sample["targets"]) + 1 + len(ids_ru) <= MAX_LEN
                    and random.random() < 0.5):
                sample["inputs"].extend(ids_zh)
                sample["targets"].extend(ids_ru)
            else:
                sample["inputs"] = sample["inputs"][:MAX_LEN - 1] + [EOS_ID]
                sample["targets"] = sample["targets"][:MAX_LEN - 1] + [EOS_ID]
                yield sample.copy()
                sample["inputs"] = ids_zh
                sample["targets"] = ids_ru
    if sample["inputs"]:
        sample["inputs"] = sample["inputs"][:MAX_LEN - 1] + [EOS_ID]
        sample["targets"] = sample["targets"][:MAX_LEN - 1] + [EOS_ID]
        yield sample.copy()
Example #4
def LoadorCreateVocabulary(vocab_file, dataset, vocab_size):
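    # Reuse an existing vocab file if it loads; otherwise build a new subword
    # vocabulary from the dataset generator and persist it for next time.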
    try:
        subtokenizer = SubwordTextEncoder(vocab_file)
        print('Loaded existing vocabulary')
    except Exception:
        print('Building vocabulary')
        subtokenizer = SubwordTextEncoder.build_from_generator(
            dataset, vocab_size)
        subtokenizer.store_to_file(vocab_file)
        print('Vocab File path: ', vocab_file)
    return subtokenizer
Example #5
    def build(self, token_counts, vocab_filepath):
        target_size = self._approx_vocab_size

        # Searching the minimum max_size
        max_size = self.__INITIAL_MAX_SIZE

        while True:
            max_size, success = self._run_max_size_attempt(
                max_size, token_counts)

            if success:
                break

        # The successful attempt gives an upper bound for the search; half of it
        # serves as the lower bound handed to build_to_target_size.
        min_size = 1 if max_size == self.__INITIAL_MAX_SIZE else max_size // 2

        # Generating Vocabulary file
        tf.logging.info("Generating vocab file: %s (min = %d, max = %d)" %
                        (vocab_filepath, min_size, max_size))

        encoder = SubwordTextEncoder.build_to_target_size(
            target_size,
            token_counts,
            min_size,
            max_size,
            reserved_tokens=self._reserved_tokens)

        if vocab_filepath is not None:
            encoder.store_to_file(vocab_filepath)

        return ModernMTSubwordTextEncoder(vocab_filepath)
Example #6
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"

    compressed_file = maybe_download(tmp_dir, filename, url)

    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        if os.path.exists(new_filepath):
          tf.logging.info("Subdirectory %s already exists, skipping unpacking"
                          % filepath)
        else:
          tf.logging.info("Unpacking subdirectory %s" % filepath)
          gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
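        # Only sample a limited number of bytes per file when counting tokens;
        # files whose path contains "en" get half the budget of the others.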
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(line)

  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
Example #7
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
    """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = SubwordTextEncoder(vocab_filepath)
        return vocab

    tokenizer = Tokenizer()
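    # Count token frequencies across every language file listed in
    # _DATA_FILE_URLS before building the subword vocabulary.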
    for source in _DATA_FILE_URLS:
        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                for line in source_file:
                    line = line.strip()
                    _ = tokenizer.encode(line)

    vocab = SubwordTextEncoder.build_to_target_size(vocab_size,
                                                    tokenizer.token_counts,
                                                    vocab_filepath, 1, 1e3)
    return vocab
Example #8
def _build_from_token_counts(args):
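    # Build an encoder directly from precomputed token counts, optionally write
    # the vocab file, and return the attempted max_size with the resulting size.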
    token_counts, max_size, iterations, vocab_filepath, reserved_tokens = args

    encoder = SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts,
                                    max_size,
                                    num_iterations=iterations)

    if vocab_filepath is not None:
        encoder.store_to_file(vocab_filepath)

    return max_size, encoder.vocab_size
Example #9
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if os.path.exists(vocab_filepath):
    vocab = SubwordTextEncoder(vocab_filepath)
    return vocab

  tokenizer = Tokenizer()
  for source in _DATA_FILE_URLS:
    url = source[0]
    filename = os.path.basename(url)
    read_type = "r:gz" if "tgz" in filename else "r"

    compressed_file = maybe_download(tmp_dir, filename, url)

    with tarfile.open(compressed_file, read_type) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    for lang_file in source[1]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # For some datasets a second extraction is necessary.
      if ".gz" in lang_file:
        tf.logging.info("Unpacking subdirectory %s" % filepath)
        new_filepath = os.path.join(tmp_dir, lang_file[:-3])
        gunzip_file(filepath, new_filepath)
        filepath = new_filepath

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          _ = tokenizer.encode(line)

  vocab = SubwordTextEncoder.build_to_target_size(
      vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
  return vocab
Example #10
    def __init__(self, word_embed, all_data, adv, is_train):
        # input data
        # self.all_data = all_data
        self.is_train = is_train
        self.adv = adv

        # embedding initialization
        if word_embed is not None:
            self.word_dim = word_embed.shape[1]
            self.vocab_size = word_embed.shape[0]
            w_trainable = (self.word_dim == 50)
            shape = None
        else:
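            # No pretrained embedding: size the table from the subword vocab and
            # initialise it randomly with stddev word_dim ** -0.5.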
            encoder = SubwordTextEncoder(get_vocab_file())
            self.word_dim = FLAGS.hidden_size
            self.vocab_size = encoder.vocab_size
            word_embed = tf.random_normal_initializer(0.0, self.word_dim**-0.5)
            w_trainable = True
            shape = [self.vocab_size, self.word_dim]

        self.word_embed = tf.get_variable('word_embed',
                                          initializer=word_embed,
                                          shape=shape,
                                          dtype=tf.float32,
                                          trainable=w_trainable)

        with tf.variable_scope("shared"):
            self.shared_conv = _get_model()
            self.shared_linear = tf.keras.layers.Dense(TASK_NUM,
                                                       activation=None,
                                                       name='leaner_shared')

        self.tensors = []
        self.pred = {}
        self.separate_acc = {}
        self.metric_tensors = []
        self.data = {}
        self.alignments = {}

        for task_name, data in all_data:
            with tf.variable_scope(task_name):
                self.build_task_graph(data, task_name)
Example #11
def decode(s, array=False):
    encoder = SubwordTextEncoder(get_vocab_file())
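    # decode_list returns one subtoken string per id, while decode joins the ids
    # back into a single text string.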
    if array:
        return encoder.decode_list(s)
    return encoder.decode(s)
Example #12
class Vocab:
    def __init__(self, model_config, vocab_path=None, lower=False):
        self.model_config = model_config
        self.vocab_path = vocab_path
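        # Three vocabulary modes: BERT wordpieces, a plain word-level vocab with
        # reserved symbols, or a t2t SubwordTextEncoder subword vocab.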
        if 'bert_token' in self.model_config.bert_mode:
            self.i2w = [w.strip() for w in open(self.vocab_path)]
            self.w2i = dict(zip(self.i2w, range(len(self.i2w))))
            self.bert_tokenizer = WordpieceTokenizer(
                vocab=self.i2w, unk_token=constant.SYMBOL_UNK)
            print('Populate BERT word piece vocab with size %s' %
                  self.vocab_size())
        elif self.model_config.subword_vocab_size <= 0:
            self.init_vocab()
            if vocab_path is not None:
                self.populate_vocab()
        else:
            if vocab_path is not None:
                self.populate_subword_vocab()

    def populate_subword_vocab(self):
        self.subword = SubwordTextEncoder(self.vocab_path)
        print('Subword Vocab Populated with size %d for path %s.' %
              (len(self.subword._all_subtoken_strings), self.vocab_path))

    def init_vocab(self):
        self.w2i = {}
        self.i2w = []
        self.w2i[constant.SYMBOL_GO] = 0
        self.i2w.append(constant.SYMBOL_GO)
        self.w2i[constant.SYMBOL_PAD] = 1
        self.i2w.append(constant.SYMBOL_PAD)
        self.w2i[constant.SYMBOL_UNK] = 2
        self.i2w.append(constant.SYMBOL_UNK)
        self.w2i[constant.SYMBOL_START] = 3
        self.i2w.append(constant.SYMBOL_START)
        self.w2i[constant.SYMBOL_END] = 4
        self.i2w.append(constant.SYMBOL_END)
        unk_id = 0
        for voc_id in range(len(self.i2w), constant.REVERED_VOCAB_SIZE):
            self.w2i['#unk%s#' % unk_id] = voc_id
            self.i2w.append('#unk%s#' % unk_id)
            unk_id += 1

    def populate_vocab(self, mincount=-1, topcount=50000):
        mincount = max(mincount, self.model_config.min_count)
        topcount = min(topcount, self.model_config.top_count)

        lid = 0
        for line in open(self.vocab_path):
            items = line.strip().split('\t')
            w = items[0]
            if len(items) > 1:
                cnt = int(items[1])
            else:
                # Accept words that come without an explicit count.
                cnt = 99999
            if cnt >= mincount:
                self.w2i[w] = len(self.i2w)
                self.i2w.append(w)

            lid += 1
            if lid >= topcount:
                break
        print(
            'Vocab Populated with size %d including %d reserved vocab for path %s.'
            % (len(self.i2w), constant.REVERED_VOCAB_SIZE, self.vocab_path))

    def encode(self, w):
        if 'bert_token' in self.model_config.bert_mode:
            return [self.w2i[w] for w in self.bert_tokenizer.tokenize(w)]
        elif self.model_config.subword_vocab_size <= 0:
            if w in self.w2i:
                return self.w2i[w]
            else:
                return self.w2i[constant.SYMBOL_UNK]
        else:
            return self.subword.encode(w)

    def contain(self, w):
        return w in self.w2i

    def describe(self, i):
        if 'bert_token' in self.model_config.bert_mode:
            return bert_utils.merge_tokens([self.i2w[ie] for ie in i])
        elif self.model_config.subword_vocab_size <= 0:
            if i < len(self.i2w):
                return self.i2w[i]
        else:
            # Note in subword case, i should be list of id, i.e. ids.
            return self.subword.decode(i)

    def vocab_size(self):
        if self.model_config.subword_vocab_size <= 0 or 'bert_token' in self.model_config.bert_mode:
            return len(self.i2w)
        else:
            return len(self.subword._all_subtoken_strings)

    @staticmethod
    def process_word(word, model_config):
        if word:
            if model_config.lower_case:
                word = word.lower()
            word = data_parse(word)
            return word
Example #13
    def populate_subword_vocab(self):
        self.subword = SubwordTextEncoder(self.vocab_path)
        print('Subword Vocab Populated with size %d for path %s.' %
              (len(self.subword._all_subtoken_strings), self.vocab_path))