Example #1
def create_demo_batch(sentences, dataset_type, vocab, 
                      attribute_name, target_columns, tmp_path='/tmp'):
  '''
  Args:
    sentences: A list of strings.
  '''
  # Create a temporary file.
  tmp_path = os.path.join(tmp_path, common.random_string(5))
  index = [i for i in range(len(sentences))]
  dic = {
    'index': index,
    'sentence': sentences,
  }
  for col in target_columns:
    dic[col] = [EMPTY for _ in sentences]
  df = pd.DataFrame(dic).loc[:, ['index', 'sentence'] + target_columns].set_index('index')
  
  sys.stdout = sys.stderr # Redirect stdout to stderr.
  with open(tmp_path, 'w') as f:
    f.write(df.to_csv() + '\n')
  pathes = common.dotDict({'train': tmp_path, 'valid':tmp_path, 'test':tmp_path})

  num_training_sentences = 0 # Fake value.
  dataset = getattr(self_module, dataset_type)(dataset_type, pathes, num_training_sentences, vocab, attribute_name, target_columns)
  dataset.test.load_data()
  os.remove(tmp_path)
  return dataset.test
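As a rough illustration of the temporary CSV this helper writes, here is a minimal, self-contained sketch with toy values (the EMPTY placeholder and the column names are assumptions; the real constants come from the surrounding module):

import pandas as pd

EMPTY = '-'  # assumed placeholder; the real constant is defined elsewhere in the module
sentences = ['a short sentence .', 'another one .']
target_columns = ['label']

dic = {'index': list(range(len(sentences))), 'sentence': sentences}
for col in target_columns:
  dic[col] = [EMPTY for _ in sentences]
df = pd.DataFrame(dic).loc[:, ['index', 'sentence'] + target_columns].set_index('index')
print(df.to_csv())
# index,sentence,label
# 0,a short sentence .,-
# 1,another one .,-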
Example #2
def main(args):
  if args.mode == 'train':
    sys.stderr.write('Saving config...\n')
    config = common.dotDict(args.__dict__)
    save_config(args)
  else:
    sys.stderr.write('Loading config...\n')
    config = load_config(args)

  model = getattr(myself, config.model_type)(args, config)

  if args.mode == 'train':
    model.train()
    
  elif args.mode == 'test':
    tests, origins = read_human_annotations(args.test_file)
    lines = [line for idx, line, anno in tests]
    predictions, cluster_ids = model.test(lines, 
                                          test_filepath=args.test_file)
    model.evaluate(tests, origins, predictions, cluster_ids=cluster_ids)
  elif args.mode == 'evaluate':
    tests, origins = read_human_annotations(args.test_file)
    predictions = read_dplabels()
    model.evaluate(tests, origins, predictions)
  else:
    raise ValueError('args.mode must be \'train\', \'test\', or \'evaluate\'.')
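main(args) expects an argparse-style namespace carrying at least mode and test_file. A minimal, hypothetical driver (flag names are assumptions, not the project's actual CLI; the main() defined above is assumed to be in scope):

import argparse

if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--mode', choices=['train', 'test', 'evaluate'], default='train')
  parser.add_argument('--test_file', default=None)                 # hypothetical flag
  parser.add_argument('--output_dir', default='checkpoints/tmp')   # hypothetical flag
  args = parser.parse_args()
  main(args)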
Example #3
    def setup_embeddings(self, config, vocab):
        self.embeddings = dotDict()
        n_start_vocab = len(vocab.e_word.start_vocab)
        special_tokens_emb = self.initialize_embeddings(
            'SpecialTokens',
            vocab.e_word.embeddings[:n_start_vocab].shape,
            initializer=tf.constant_initializer(
                vocab.e_word.embeddings[:n_start_vocab]),
            trainable=True)
        # e_words_emb = tf.constant(vocab.e_word.embeddings[n_start_vocab:],
        #                           dtype=tf.float32)
        # j_words_emb = tf.constant(vocab.j_word.embeddings[n_start_vocab:],
        #                           dtype=tf.float32)
        e_words_emb = self.initialize_embeddings(
            'EnWords',
            vocab.e_word.embeddings[n_start_vocab:].shape,
            initializer=tf.constant_initializer(
                vocab.e_word.embeddings[n_start_vocab:]),
            trainable=config.train_embedding)
        j_words_emb = self.initialize_embeddings(
            'JPWords',
            vocab.j_word.embeddings[n_start_vocab:].shape,
            initializer=tf.constant_initializer(
                vocab.j_word.embeddings[n_start_vocab:]),
            trainable=config.train_embedding)

        self.embeddings.e_word = tf.concat([special_tokens_emb, e_words_emb],
                                           axis=0)
        self.embeddings.j_word = tf.concat([special_tokens_emb, j_words_emb],
                                           axis=0)
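To see why splitting the pretrained matrix at n_start_vocab and concatenating it back preserves the word-ID layout, here is a small numpy-only sketch (the matrix and n_start_vocab are toy values):

import numpy as np

pretrained = np.arange(12, dtype=np.float32).reshape(6, 2)  # toy embedding matrix: 6 IDs, dim 2
n_start_vocab = 2  # e.g. special tokens such as PAD/UNK occupy the first rows

special = pretrained[:n_start_vocab]  # kept trainable and shared across both languages
words = pretrained[n_start_vocab:]    # trainability follows config.train_embedding

rebuilt = np.concatenate([special, words], axis=0)
assert (rebuilt == pretrained).all()  # row i still corresponds to word ID i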
Example #4
  def __init__(self, args, sess, vocab=None):
    self.sess = sess
    self.config = self.load_config(args)
    self.logger = common.logManager(handler=FileHandler(args.log_file)) if args.log_file else common.logManager()

    sys.stderr.write(str(self.config) + '\n')

    data_class = getattr(datasets, self.config.dataset_type)
    self.vocab = common.dotDict()
    if self.config.embeddings:
      emb_conf = self.config.embeddings
      self.vocab.e_word = vocabularies.WordVocabularyWithEmbedding(
        emb_conf.en.path, 
        vocab_size=self.config.w_vocab_size,
        lowercase=self.config.lowercase,
        normalize_digits=self.config.normalize_digits,
        skip_first=emb_conf.en.skip_first)
      self.vocab.j_word = vocabularies.WordVocabularyWithEmbedding(
        emb_conf.ja.path, 
        vocab_size=self.config.w_vocab_size,
        lowercase=self.config.lowercase,
        normalize_digits=self.config.normalize_digits,
        skip_first=emb_conf.ja.skip_first)
      self.c_vocab = None
    #self.w_vocab, self.c_vocab = data_class.create_vocab_from_data(self.config)
    self.dataset = data_class(self.config.dataset_info, 
                              self.vocab.e_word, self.c_vocab)
Example #5
  def yield_batch(self, batch_by_column):
    b_sources, b_targets, b_ori_sources, b_pos = batch_by_column
    b_targets = list(zip(*b_targets)) # to column-major.
    return common.dotDict({
      'sources': np.array(b_sources),
      # Include only the labels in 'target_columns' to batch.
      'targets': [np.array(t) for t, col in zip(b_targets, self.all_columns)
                  if col in self.target_columns],
      'original_sources': b_ori_sources,
      'pos': b_pos,
    })
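The list(zip(*b_targets)) call transposes the row-major batch (one label tuple per example) into column-major form (one tuple per target column), so it can be filtered column by column. The idiom in isolation:

# Each inner list holds the targets of one example: (label_a, label_b).
row_major = [[1, 10], [2, 20], [3, 30]]
column_major = list(zip(*row_major))
print(column_major)  # [(1, 2, 3), (10, 20, 30)] -- one tuple per target column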
Example #6
  def load_data(self):
    self.load = True
    sys.stderr.write('Loading dataset from %s ...\n' % (self.path))
    df = pd.read_csv(self.path, nrows=self.max_lines)

    sys.stderr.write('Preprocessing ...\n')
    contexts, responses, speaker_changes = self.preprocess(df)

    if not self.wbase and not self.cbase:
      raise ValueError('Either \'wbase\' or \'cbase\' must be True.')

    self.speaker_changes = [self.sc_vocab.sent2id(sc) for sc in speaker_changes]

    # Separate contexts and responses into words (or chars), and convert them into their IDs.
    self.original = common.dotDict({})
    self.symbolized = common.dotDict({})

    if self.wbase:
      self.original.w_contexts = [[self.w_vocab.tokenizer(u) for u in context] 
                                  for context in contexts]
      self.symbolized.w_contexts = [[self.w_vocab.sent2id(u) for u in context] 
                                    for context in self.original.w_contexts]
    else:
      self.original.w_contexts = [None for context in contexts] 
      self.symbolized.w_contexts = [None for context in contexts] 

    if self.cbase:
      self.original.c_contexts = [[self.c_vocab.tokenizer(u) for u in context] 
                                  for context in contexts]

      self.symbolized.c_contexts = [[self.c_vocab.sent2id(u) for u in context] 
                                    for context in self.original.c_contexts]
    else:
      self.original.c_contexts = [None for context in contexts]
      self.symbolized.c_contexts = [None for context in contexts]
    self.original.responses = [self.w_vocab.tokenizer(r) for r in responses]
    self.symbolized.responses = [self.w_vocab.sent2id(r) for r in responses]

    responses = self.symbolized.responses
    w_contexts = self.symbolized.w_contexts 
    self.texts = common.flatten(w_contexts) + list(responses)
Example #7
  def yield_batch(self, batch):
    '''
    Args:
      - batch: A list of lists holding 'batch_size' examples (batch_size is the
        value passed to get_batch()); batch[i] contains the i-th return value
        of get_batch_data() for every example, i.e. the shape of 'batch' is
        [len(self.get_batch_data(...)), batch_size].

    Returns: A batch as a dictionary.
    '''
    b_sources, b_targets, b_ori_sources = batch
    b_targets = list(zip(*b_targets)) # to column-major.

    return common.dotDict({
      'sources': np.array(b_sources),
      # Include only the labels in 'target_columns' to batch.
      'targets': [np.array(t) for t, col in zip(b_targets, self.all_columns) if col in self.target_columns],
      'original_sources': b_ori_sources,
    })
Example #8
    def get_batch(self,
                  batch_size,
                  input_max_len=None,
                  output_max_len=None,
                  shuffle=False):
        sources, targets = self.symbolized
        if input_max_len:
            paired = [(s, t) for s, t in zip(sources, targets)
                      if not len(s) > input_max_len]
            sources, targets = list(zip(*paired))

        sources = tf.keras.preprocessing.sequence.pad_sequences(
            sources,
            maxlen=input_max_len,
            padding='post',
            truncating='post',
            value=PAD_ID)
        targets = list(zip(*targets))  # to column-major. (for padding)
        targets = [
            tf.keras.preprocessing.sequence.pad_sequences(
                targets_by_column,
                maxlen=output_max_len,
                padding='post',
                truncating='post',
                value=PAD_ID) for targets_by_column in targets
        ]
        targets = list(zip(*targets))  # to idx-major. (for shuffling)

        data = [
            tuple(x)
            for x in zip(sources, targets, self.original_sources, self.targets)
        ]

        if shuffle:
            random.shuffle(data)
        for i, b in itertools.groupby(enumerate(data),
                                      lambda x: x[0] // batch_size):
            batch = [x[1] for x in b]
            b_sources, b_targets, b_ori_sources, b_ori_targets = zip(*batch)
            b_targets = list(zip(*b_targets))  # to column-major.
            yield common.dotDict({
                'sources': np.array(b_sources),
                'targets': [np.array(t) for t in b_targets],
                'original_sources': b_ori_sources,
                'original_targets': b_ori_targets,
            })
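get_batch delegates both padding and truncation to tf.keras.preprocessing.sequence.pad_sequences. A minimal sketch of its post-padding/post-truncating behaviour, assuming PAD_ID is 0:

import tensorflow as tf

PAD_ID = 0  # assumed; the real constant is defined in the surrounding module
sources = [[5, 6, 7], [8, 9], [1, 2, 3, 4, 5, 6]]
padded = tf.keras.preprocessing.sequence.pad_sequences(
    sources, maxlen=4, padding='post', truncating='post', value=PAD_ID)
print(padded)
# [[5 6 7 0]
#  [8 9 0 0]
#  [1 2 3 4]]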
Example #9
def load_config(args):
  if os.path.exists(os.path.join(args.output_dir, CONFIG_NAME + '.txt')):
    config = collections.defaultdict()
    for l in open(os.path.join(args.output_dir, CONFIG_NAME + '.txt')):
      k, v, type_name = l.replace('\n', '').split('\t')
      if type_name == 'tuple':
        config[k] = common.str2tuple(v)
      elif type_name == 'int':
        config[k] = int(v)
      elif type_name == 'float':
        config[k] = float(v)
      else:
        config[k] = v
    config = common.dotDict(config)
  else:
    raise ValueError('No config file was found.')
  sys.stderr.write(str(config)+'\n')
  return config
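load_config parses one tab-separated 'key<TAB>value<TAB>type' line per option, so the save_config called in Example #2 presumably emits that format. A minimal sketch of such a writer (the file name and the CONFIG_NAME value are assumptions based on load_config, not the project's actual code):

import os

def save_config(args, config_name='config'):  # CONFIG_NAME assumed to be 'config'
  path = os.path.join(args.output_dir, config_name + '.txt')
  with open(path, 'w') as f:
    for k, v in args.__dict__.items():
      f.write('%s\t%s\t%s\n' % (k, v, type(v).__name__))  # matches the type names load_config checks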
Example #10
    def create_demo_batch(self, text, output_max_len):
        source = [self.vocab.tokens2ids(text)]
        targets = [[[0] for _ in self.targets_name]]
        targets = list(zip(*targets))  # to column-major. (for padding)
        source = tf.keras.preprocessing.sequence.pad_sequences(
            source, padding='post', truncating='post', value=PAD_ID)
        targets = [
            tf.keras.preprocessing.sequence.pad_sequences(
                targets_by_column,
                maxlen=output_max_len,
                padding='post',
                truncating='post',
                value=PAD_ID) for targets_by_column in targets
        ]

        yield common.dotDict({
            'sources': np.array(source),
            'targets': [np.array(t) for t in targets],
        })
Example #11
    def __init__(self, args, sess, vocab=None):
        self.sess = sess
        self.config = config = self.get_config(args)
        self.mode = args.mode
        self.logger = common.logManager(handler=FileHandler(
            args.log_file)) if args.log_file else common.logManager()

        sys.stderr.write(str(self.config) + '\n')
        # Lazy loading.
        self.dataset = common.dotDict({
            'train': None,
            'valid': None,
            'test': None
        })
        self.dataset_type = getattr(datasets, config.dataset_type)
        if not args.interactive:  # For saving time when running in jupyter.
            self.vocab = WordVocabularyWithEmbedding(
                config.embeddings,
                vocab_size=config.vocab_size,
                lowercase=config.lowercase) if vocab is None else vocab
Example #12
    def __init__(self, args, sess, vocab=None):
        self.sess = sess
        self.config = self.load_config(args)
        self.mode = args.mode
        self.logger = common.logManager(handler=FileHandler(
            args.log_file)) if args.log_file else common.logManager()

        sys.stderr.write(str(self.config) + '\n')

        if True or not args.interactive:  # 'True or' makes this branch unconditional, so the vocabulary is always loaded.
            self.vocab = common.dotDict()
            self.vocab.word = WordVocabularyWithEmbedding(
                self.config.embeddings,
                vocab_size=self.config.vocab_size,
                lowercase=self.config.lowercase,
                normalize_digits=self.config.normalize_digits,
            ) if vocab is None else vocab
            self.dataset = getattr(datasets, self.config.dataset_type)(
                self.config.dataset_type, self.config.dataset_path,
                self.config.num_train_data, self.vocab,
                self.config.target_attribute, self.config.target_columns)
Example #13
  def get_batch(self, batch_size, word_max_len=0,
                utterance_max_len=0, shuffle=False):
    if not self.load:
      self.load_data() # lazy loading.

    data = self.texts
    if shuffle: # For training.
      random.shuffle(data)
    for i, b in itertools.groupby(enumerate(data), 
                                  lambda x: x[0] // batch_size):
      batch = [x[1] for x in b]
      texts = batch
      _utterance_max_len_data = max([len(u) for u in texts]) 
      if not utterance_max_len or _utterance_max_len_data < utterance_max_len:
        _utterance_max_len = _utterance_max_len_data
      else:
        _utterance_max_len = utterance_max_len
      texts = np.array(texts)
      texts = tf.keras.preprocessing.sequence.pad_sequences(
        texts, maxlen=_utterance_max_len, 
        padding='post', truncating='post', value=PAD_ID)
      yield common.dotDict({
        'texts': texts
      })
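The batching loop relies on itertools.groupby over enumerate(data) with an index // batch_size key, which cuts the (optionally shuffled) list into consecutive chunks of at most batch_size items. The idiom in isolation:

import itertools

data = list('abcdefg')
batch_size = 3
for i, group in itertools.groupby(enumerate(data), lambda x: x[0] // batch_size):
  batch = [x[1] for x in group]
  print(i, batch)
# 0 ['a', 'b', 'c']
# 1 ['d', 'e', 'f']
# 2 ['g']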