Example #1
class Dataset(Configurable):
    """"""

    # =============================================================
    def __init__(self, filename, vocabs, builder, *args, **kwargs):
        """"""

        super(Dataset, self).__init__(*args, **kwargs)
        self.vocabs = vocabs

        self.train_domains_set = (set(self.train_domains.split(',')) if
                                  (self.train_domains != '-'
                                   and self.name == "Trainset") else set())
        # print("Loading training data from domains:", self.train_domains_set
        # if self.train_domains_set else "all")

        self._file_iterator = self.file_iterator(filename)
        self._train = (filename == self.train_file)
        self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
        self._data = None
        self.reset([])
        if filename:
            self.rebucket()

        if self.use_elmo:
            from lib.models import ElmoLSTMEncoder
            with tf.compat.v1.variable_scope(
                    tf.compat.v1.get_variable_scope(),
                    reuse=(self.name != "Trainset")):
                self.elmo_encoder = ElmoLSTMEncoder(self)

        self.inputs = tf.compat.v1.placeholder(dtype=tf.int32,
                                               shape=(None, None, None),
                                               name='inputs')
        self.targets = tf.compat.v1.placeholder(dtype=tf.int32,
                                                shape=(None, None, None),
                                                name='targets')
        self.step = tf.compat.v1.placeholder_with_default(0.,
                                                          shape=None,
                                                          name='step')
        self.builder = builder()

    # =============================================================
    def file_iterator(self, filename):
        """"""
        if not filename:
            yield [[]]
            return
        # print(f"Dataset.file_iterator {filename}")
        with open(filename) as f:
            if self.lines_per_buffer > 0:
                buff = [[]]
                while True:
                    line = f.readline()
                    while line:
                        # print(f"Dataset.file_iterator line: {line}")
                        line = line.strip().split()
                        if line and (not self.train_domains_set
                                     or line[0].split('/')[0]
                                     in self.train_domains_set):
                            buff[-1].append(line)
                        else:
                            if len(buff) < self.lines_per_buffer:
                                if len(buff[-1]) > 0:
                                    buff.append([])
                                else:
                                    buff[-1] = []
                            else:
                                break
                        line = f.readline()
                    if not line:
                        f.seek(0)
                    else:
                        buff = self._process_buff(buff)
                        yield buff
                        line = line.strip().split()
                        if line:
                            buff = [[line]]
                        else:
                            buff = [[]]
            else:
                buff = [[]]
                for line in f:
                    line = line.strip().split()
                    # print(f"Dataset.file_iterator line: {line}")
                    if line and (not self.train_domains_set
                                 or line[0].split('/')[0]
                                 in self.train_domains_set):
                        buff[-1].append(line)
                    else:
                        if len(buff[-1]) > 0:
                            buff.append([])
                        else:
                            buff[-1] = []
                if buff[-1] == []:
                    buff.pop()
                buff = self._process_buff(buff)
                while True:
                    yield buff

    # =============================================================
    def _process_buff(self, buff):
        """"""

        # tmp_f = open("debug_data_%s" % self.name, 'w')

        words, tags, rels, srls, predicates, domains = self.vocabs
        srl_start_field = srls.conll_idx[0]
        sents = 0
        toks = 0
        examples = 0
        total_predicates = 0
        buff2 = []
        for i, sent in enumerate(buff):
            sents += 1
            sent_len = len(sent)
            num_fields = len(sent[0])
            srl_take_indices = [
                idx for idx in list(
                    range(srl_start_field, srl_start_field + sent_len))
                if idx < num_fields - 1 and (self.train_on_nested or np.all(
                    ['/' not in sent[j][idx] for j in list(range(sent_len))]))
            ]
            predicate_indices = []
            for j, token in enumerate(sent):
                toks += 1
                if self.conll:
                    word, tag1, tag2, head, rel = (token[words.conll_idx],
                                                   token[tags.conll_idx[0]],
                                                   token[tags.conll_idx[1]],
                                                   token[6],
                                                   token[rels.conll_idx])
                    if rel == 'root':
                        head = j
                    else:
                        head = int(head) - 1
                    buff[i][j] = ((word, ) + words[word] + tags[tag1] +
                                  tags[tag2] + (head, ) + rels[rel])
                elif self.conll2012:
                    # print(f"Dataset._process_buff token {j}:{token}")
                    word, auto_tag, gold_tag, head, rel = (
                        token[words.conll_idx], token[tags.conll_idx[0]],
                        token[tags.conll_idx[1]], token[6],
                        token[rels.conll_idx])
                    # print(f"Dataset token read {word}, {auto_tag},
                    # {gold_tag}, {head}, {rel}")
                    domain = token[0].split('/')[0]
                    # print(word, tag1, tag2, head, rel)
                    if rel == 'root':
                        head = j
                    else:
                        head = int(head) - 1

                    # srl_fields = [token[idx] if idx < len(token)-1 else 'O'
                    # for idx in list(range(srl_start_field, srl_start_field
                    # + sent_len))
                    # todo can we use fancy indexing here?
                    srl_fields = [token[idx] for idx in srl_take_indices]
                    srl_fields += ['O'] * (sent_len - len(srl_take_indices))
                    srl_tags = [srls[s][0] for s in srl_fields]

                    if self.joint_pos_predicates:
                        is_predicate = (token[predicates.conll_idx[0]] != '-'
                                        and
                                        (self.train_on_nested
                                         or self.predicate_str in srl_fields))
                        tok_predicate_str = str(is_predicate) + '/' + gold_tag
                    else:
                        is_predicate = (token[predicates.conll_idx] != '-' and
                                        (self.train_on_nested
                                         or self.predicate_str in srl_fields))
                        tok_predicate_str = str(is_predicate)

                    if is_predicate:
                        predicate_indices.append(j)

                    buff[i][j] = ((word, ) + words[word] + tags[auto_tag] +
                                  predicates[tok_predicate_str] +
                                  domains[domain] + (sents, ) +
                                  tags[gold_tag] + (head, ) + rels[rel] +
                                  tuple(srl_tags))
                    # print(f"Dataset buff[{i}][{j}] = {buff[i][j]}")
                    # print(f"Dataset buff {word}, {word}, {auto_tag},
                    # {tok_predicate_str}, {domain}, {sents}, {gold_tag},
                    # {head}, {rel}, {srl_tags}")

            # Expand sentences into one example per predicate
            if self.one_example_per_predicate:
                # grab the sent
                # should be sent_len x sent_elements
                sent = np.array(buff[i])
                # print(sent)
                is_predicate_idx = 4
                srl_start_idx = 10
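                # Column layout implied by is_predicate_idx / srl_start_idx
                # and the tuples built above: 0 = word string, 1-2 = word ids,
                # 3 = auto tag, 4 = predicate, 5 = domain, 6 = sentence
                # number, 7 = gold tag, 8 = head, 9 = relation,
                # 10+ = one SRL column per predicate.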
                word_part = sent[:, 0].astype('O')
                srl_part = sent[:, srl_start_idx:].astype(np.int32)
                rest_part = sent[:, 1:srl_start_idx].astype(np.int32)
                # print("orig sent (%d):" % len(predicate_indices),
                # sent[:, :8+len(predicate_indices)])
                # print("orig preds:", [map(lambda x: srls[int(x)], t) for t
                # in sent[:, srl_start_idx:srl_start_idx+len(
                # predicate_indices)]])
                if predicate_indices:
                    for k, p_idx in enumerate(predicate_indices):
                        # should be sent_len x sent_elements
                        rest_part[:, is_predicate_idx -
                                  1] = predicates["False"][0]
                        rest_part[p_idx,
                                  is_predicate_idx - 1] = predicates["True"][0]
                        correct_srls = srl_part[:, k]
                        new_sent = np.concatenate([
                            np.expand_dims(word_part, -1), rest_part,
                            np.expand_dims(correct_srls, -1)
                        ],
                                                  axis=1)
                        buff2.append(new_sent)
                        # print("new sent:", new_sent)
                        # print("new preds:", map(lambda x: srls[int(x)],
                        # new_sent[:, -1]))
                        # tokens_str = ' '.join(word_part)
                        # labels_str = ' '.join(map(lambda x: srls[x],
                        # correct_srls))
                        # # idx, tokens, labels
                        # print("%d %s ||| %s" % (p_idx, tokens_str,
                        # labels_str), file=tmp_f)
                        total_predicates += 1
                        examples += 1
                else:
                    new_sent = np.concatenate(
                        [np.expand_dims(word_part, -1), rest_part], axis=1)
                    buff2.append(new_sent)
                    examples += 1
            # else:
            #     buff2.append(np.concatenate[np.expand_dims(word_part, -1),
            # rest_part, srl_part], axis=1) #(sent[0],) + map(int, sent[1:]))
            #     examples += 1
        # tmp_f.close()
        if self.one_example_per_predicate:
            # print("Loaded %d sentences with %d tokens, %d examples
            # (%d predicates) (%s)" % (sents, toks, examples,
            # total_predicates, self.name))
            return buff2
        else:
            # print(f"Loaded {sents} sentences with {toks} tokens {self.name}")
            return buff

    # =============================================================
    def reset(self, sizes):
        """"""

        self._data = []
        self._targets = []
        self._metabucket.reset(sizes)
        return

    # =============================================================
    def rebucket(self):
        """"""

        buff = next(self._file_iterator)
        # print(f"Dataset.rebucket {buff}")
        len_cntr = Counter()

        for sent in buff:
            len_cntr[len(sent)] += 1
        n_bkts = self.n_bkts if len(len_cntr) >= self.n_bkts else len(len_cntr)
        # print(f"Dataset.rebucket n_bkts: {n_bkts}, {self.n_bkts}")
        self._metabucket = Metabucket(self._config, n_bkts=n_bkts)
        splits = KMeans(n_bkts, len_cntr).splits
        # print(f"Dataset.rebucket splits: {splits}")
        self.reset(splits)

        for sent in buff:
            self._metabucket.add(sent)
        self._finalize()
        return

    # =============================================================
    def _finalize(self):
        """"""

        self._metabucket._finalize()
        return

    # =============================================================
    def get_minibatches(self,
                        batch_size,
                        input_idxs,
                        target_idxs,
                        shuffle=True):
        """"""

        # print(f"Dataset.get_minibatches {batch_size}, {input_idxs},
        # {target_idxs}, {shuffle}")
        minibatches = []
        for bkt_idx, bucket in enumerate(self._metabucket):
            if batch_size == 0:
                n_splits = 1
            else:
                n_tokens = len(bucket) * bucket.size
                n_splits = max(n_tokens // batch_size, 1)
            if shuffle:
                range_func = np.random.permutation
            else:
                range_func = np.arange
            arr_sp = np.array_split(range_func(len(bucket)), n_splits)
            for bkt_mb in arr_sp:
                minibatches.append((bkt_idx, bkt_mb))
        if shuffle:
            np.random.shuffle(minibatches)
        for bkt_idx, bkt_mb in minibatches:
            feed_dict = {}
            data = self[bkt_idx].data[bkt_mb]
            sents = self[bkt_idx].sents[bkt_mb]
            try:
                maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
            except IndexError as e:
                print(f"IndexError while enumerating minibatches",
                      file=sys.stderr)
                raise DatasetError(f"IndexError while enumerating minibatches")

            # np.set_printoptions(threshold=np.nan)
            # print("maxlen", maxlen)
            # print("maxlen+max(target_idxs)", maxlen+max(target_idxs))
            # print("data.shape[2]", data.shape[2])
            # targets = data[:,:maxlen,min(target_idxs):
            # maxlen+max(target_idxs)+1]
            # print("targets shape", targets.shape)
            # print("data[:,:,3:] shape", targets[:,:,3:].shape)

            feed_dict.update({
                self.inputs:
                data[:, :maxlen, input_idxs],
                self.targets:
                data[:, :maxlen,
                     min(target_idxs):maxlen + max(target_idxs) + 1]
            })
            if self.use_elmo:
                feed_dict = self.elmo_encoder.get_feed_dict(feed_dict, sents)
            # print(f"Dataset.get_minibatches yields {feed_dict}")
            yield feed_dict, sents

    # =============================================================
    @property
    def n_bkts(self):
        if self._train:
            return super(Dataset, self).n_bkts
        else:
            return super(Dataset, self).n_valid_bkts

    # =============================================================
    def max_batch_size(self):
        # print([b._data.shape[0] for b in self._metabucket._buckets])
        max_batch_size = np.max(
            [b._data.shape[0] for b in self._metabucket._buckets])
        # print("max batch size: ", max_batch_size)

        if self.name in ["Testset", "Analyzeset"]:
            return self.max_test_batch_size
        elif self.name == "Validset":
            return self.max_dev_batch_size
        return max_batch_size

    # =============================================================
    def __getitem__(self, key):
        return self._metabucket[key]

    def __len__(self):
        return len(self._metabucket)
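
The get_minibatches() generator above yields (feed_dict, sents) pairs whose keys are the dataset's own placeholders, so a training loop only has to pass each feed_dict on to session.run(). A minimal usage sketch, assuming a model whose train_op and loss tensors were built elsewhere from dataset.inputs and dataset.targets; the batch size and column indices below are illustrative, not taken from the original configuration:

import tensorflow as tf


def run_one_epoch(dataset, sess, train_op, loss):
    """Hypothetical helper, not part of the original code."""
    losses = []
    # Each feed_dict already maps dataset.inputs / dataset.targets (and the
    # ELMo tensors, if enabled) to numpy arrays for one bucketed minibatch.
    for feed_dict, _ in dataset.get_minibatches(batch_size=5000,
                                                input_idxs=[0, 1, 2],
                                                target_idxs=[3, 4],
                                                shuffle=True):
        batch_loss, _ = sess.run([loss, train_op], feed_dict=feed_dict)
        losses.append(batch_loss)
    return sum(losses) / max(len(losses), 1)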
Example #2
class Dataset(Configurable):
  """"""

  #=============================================================
  def __init__(self, filename, vocabs, builder, *args, **kwargs):
    """"""
    self.forest_file_name = kwargs.pop("forest_file", None)

    if self.forest_file_name is not None:
      print("[tlog] self.forest_file_name: " + self.forest_file_name)

    super(Dataset, self).__init__(*args, **kwargs)
    self.vocabs = vocabs
    self._file_iterator = self.file_iterator(filename)
    self._train = (filename == self.train_file)
    self._forest_data = self.load_forest_file(self.forest_file_name)
    self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
    self._data = None
    self.rebucket()

    self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs')
    self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets')
    self.in_neighbor = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='in_neighbor') # [batch, word, neigh]
    self.in_neighbor_rel = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='in_neighbor_rel') # [batch, word, neigh]
    self.in_neighbor_mask = tf.placeholder(dtype=tf.int32, shape=(None, None, None),
                                      name='in_neighbor_mask')  # [batch, word, neigh]
    self.out_neighbor = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor')  # [batch, word, neigh]
    self.out_neighbor_rel = tf.placeholder(dtype=tf.int32, shape=(None, None, None),
                                          name='out_neighbor_rel')  # [batch, word, neigh]
    self.out_neighbor_mask = tf.placeholder(dtype=tf.int32, shape=(None, None, None),
                                       name='out_neighbor_mask')  # [batch, word, neigh]
    self.builder = builder()

  #=============================================================
  def file_iterator(self, filename):
    """"""

    with open(filename) as f:
      if self.lines_per_buffer > 0:
        buff = [[]]
        while True:
          line = f.readline()
          while line:
            line = line.strip().split()
            if line:
              buff[-1].append(line)
            else:
              if len(buff) < self.lines_per_buffer:
                if buff[-1]:
                  buff.append([])
              else:
                break
            line = f.readline()
          if not line:
            f.seek(0)
          else:
            buff = self._process_buff(buff)
            yield buff
            line = line.strip().split()
            if line:
              buff = [[line]]
            else:
              buff = [[]]
      else:
        buff = [[]]
        for line in f:
          line = line.strip().split()
          if line:
            buff[-1].append(line)
          else:
            if buff[-1]:
              buff.append([])
        if buff[-1] == []:
          buff.pop()
        buff = self._process_buff(buff)
        while True:
          yield buff

  #=============================================================
  def _remove_duplicate_items(self, node_index, neighbor, neighbor_rel, add_self=True, REL_UNK=2):
    unique_neighbor, unique_neighbor_rel = [], []
    node_cache = set()
    if add_self:
      unique_neighbor.append(node_index)
      unique_neighbor_rel.append(REL_UNK)
      node_cache.add((node_index, REL_UNK))
    for i in range(len(neighbor)):
      if (neighbor[i], neighbor_rel[i]) not in node_cache:
        unique_neighbor.append(neighbor[i])
        unique_neighbor_rel.append(neighbor_rel[i])
        node_cache.add((neighbor[i], neighbor_rel[i]))
    return unique_neighbor, unique_neighbor_rel
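  # Illustration of the de-duplication above: with add_self=True,
  #   _remove_duplicate_items(1, [2, 2, 3], [5, 5, 7])
  # returns ([1, 2, 3], [2, 5, 7]) -- the node itself is prepended with the
  # REL_UNK relation and the duplicate (2, 5) pair is dropped.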

  # =============================================================
  def _process_buff(self, buff):
    """"""

    words, tags, rels = self.vocabs
    for i, sent in enumerate(buff):
      if self.use_forest:
        sent_str = tuple([token[1] for token in sent])
        triples, adj_lists = self._forest_data[len(sent_str)][sent_str]
        #print("[tlog] adj_lists: " + str(adj_lists))
        #sys.exit(0)
      for j, token in enumerate(sent):
        word, tag1, tag2, head, rel = (token[words.conll_idx],
                                       token[tags.conll_idx[0]],
                                       token[tags.conll_idx[1]],
                                       token[6], token[rels.conll_idx])
        if self.use_forest:
          #print("[tlog] adj_lists: " + str(adj_lists[0][j + 1]) + "\t" + str(adj_lists[1][j + 1]))
          unique_in_neighbor, unique_in_neighbor_rel = self._remove_duplicate_items(j+1,
                  adj_lists[0][j+1], adj_lists[1][j+1])
          unique_out_neighbor, unique_out_neighbor_rel = self._remove_duplicate_items(j+1,
                  adj_lists[2][j+1], adj_lists[3][j+1])
          #print("[tlog] adj_lists: " + str(adj_lists[0][j + 1]) + "\t" + str(adj_lists[1][j + 1]))
          #sys.exit(0)
          buff[i][j] = ((word,) + words[word] + tags[tag1] + tags[tag2] +
                        (int(head),) + rels[rel] +
                        (unique_in_neighbor, unique_in_neighbor_rel,
                         unique_out_neighbor, unique_out_neighbor_rel))
        else:
          buff[i][j] = ((word,) + words[word] + tags[tag1] + tags[tag2] +
                        (int(head),) + rels[rel])

      if self.use_forest:
        unique_in_neighbor, unique_in_neighbor_rel = self._remove_duplicate_items(0,
                adj_lists[0][0], adj_lists[1][0])
        unique_out_neighbor, unique_out_neighbor_rel = self._remove_duplicate_items(0,
                adj_lists[2][0], adj_lists[3][0])
        sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT, \
                        unique_in_neighbor, unique_in_neighbor_rel, unique_out_neighbor, unique_out_neighbor_rel))
      else:
        sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT))
    return buff

  # =============================================================
  def load_forest_file(self, forest_file_name):
    if forest_file_name is None or not self.use_forest:
      return
    if self.forest_type == 0:
      return load_nbest(forest_file_name, self.nbest_only_keep, self.vocabs[2])
    elif self.forest_type == 1:
      return load_cube(forest_file_name, self.cube_only_keep)
    elif self.forest_type == 2:
      return load_cube(forest_file_name, self.nbest_only_keep)
    elif self.forest_type == 3:
      return load_cubesparse(forest_file_name, self.cube_only_keep, self.vocabs[2])
    else:
      print("[Error] forest_type must be in [0, 1, 2]\n " +
            "\t 0 --- nbest, 10 \n" +
            "\t 1 --- cube, 0.05 \n" +
            "\t 2 --- cube, 10 \n" +
            "\t 3 --- cubesparse, 0.05 \n")
      sys.exit(0)
  #=============================================================
  def reset(self, sizes):
    """"""

    self._data = []
    self._targets = []
    self._metabucket.reset(sizes)
    return

  #=============================================================
  def rebucket(self):
    """"""

    buff = next(self._file_iterator)
    len_cntr = Counter()

    for sent in buff:
      len_cntr[len(sent)] += 1
    self.reset(KMeans(self.n_bkts, len_cntr).splits)

    for sent in buff:
      self._metabucket.add(sent)
    self._finalize()
    return

  #=============================================================
  def _finalize(self):
    """"""

    self._metabucket._finalize()
    return

  #=============================================================
  def get_minibatches(self, batch_size, input_idxs, target_idxs, forest_idxs=None, shuffle=True):
    """"""

    minibatches = []
    for bkt_idx, bucket in enumerate(self._metabucket):
      if batch_size == 0:
        n_splits = 1
      else:
        n_tokens = len(bucket) * bucket.size
        n_splits = max(n_tokens // batch_size, 1)
      if shuffle:
        range_func = np.random.permutation
      else:
        range_func = np.arange
      arr_sp = np.array_split(range_func(len(bucket)), n_splits)
      for bkt_mb in arr_sp:
        minibatches.append((bkt_idx, bkt_mb))
    if shuffle:
      np.random.shuffle(minibatches)
    for bkt_idx, bkt_mb in minibatches:
      feed_dict = {}
      data = self[bkt_idx].data[bkt_mb]
      sents = self[bkt_idx].sents[bkt_mb]
      maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
      feed_dict.update({
        self.inputs: data[:, :maxlen, input_idxs],
        self.targets: data[:, :maxlen, target_idxs]
      })
      if self.use_forest and forest_idxs is not None:

        in_neighbor_data = self[bkt_idx].in_neighbor_data[bkt_mb]
        in_neighbor_rel_data = self[bkt_idx].in_neighbor_rel_data[bkt_mb]
        in_neighbor_mask = self[bkt_idx].in_neighbor_mask[bkt_mb]

        out_neighbor_data = self[bkt_idx].out_neighbor_data[bkt_mb]
        out_neighbor_rel_data = self[bkt_idx].out_neighbor_rel_data[bkt_mb]
        out_neighbor_mask = self[bkt_idx].out_neighbor_mask[bkt_mb]
        feed_dict.update({
          self.in_neighbor: in_neighbor_data[:, :maxlen],
          self.in_neighbor_rel: in_neighbor_rel_data[:, :maxlen],
          self.in_neighbor_mask: in_neighbor_mask[:, :maxlen],

          self.out_neighbor: out_neighbor_data[:, :maxlen],
          self.out_neighbor_rel: out_neighbor_rel_data[:, :maxlen],
          self.out_neighbor_mask: out_neighbor_mask[:, :maxlen],
        })
      yield feed_dict, sents

  #=============================================================
  @property
  def n_bkts(self):
    if self._train:
      return super(Dataset, self).n_bkts
    else:
      return super(Dataset, self).n_valid_bkts

  #=============================================================
  def __getitem__(self, key):
    return self._metabucket[key]
  def __len__(self):
    return len(self._metabucket)
Example #3
class Dataset(Configurable):
  """"""
  
  #=============================================================
  def __init__(self, filename, vocabs, builder, *args, **kwargs):
    """"""
    
    super(Dataset, self).__init__(*args, **kwargs)
    self._file_iterator = self.file_iterator(filename)
    self._train = (filename == self.train_file)
    self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
    self._data = None
    self.vocabs = vocabs
    self.rebucket()
    
    self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs')
    self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets')
    self.builder = builder()
  
  #=============================================================
  def file_iterator(self, filename):
    """"""
    
    with open(filename) as f:
      if self.lines_per_buffer > 0:
        buff = [[]]
        while True:
          line = f.readline()
          while line:
            line = line.strip().split()
            if line:
              buff[-1].append(line)
            else:
              if len(buff) < self.lines_per_buffer:
                if buff[-1]:
                  buff.append([])
              else:
                break
            line = f.readline()
          if not line:
            f.seek(0)
          else:
            buff = self._process_buff(buff)
            yield buff
            line = line.strip().split()
            if line:
              buff = [[line]]
            else:
              buff = [[]]
      else:
        buff = [[]]
        for line in f:
          line = line.strip().split()
          if line:
            buff[-1].append(line)
          else:
            if buff[-1]:
              buff.append([])
        if buff[-1] == []:
          buff.pop()
        buff = self._process_buff(buff)
        while True:
          yield buff
  
  #=============================================================
  def _process_buff(self, buff):
    """"""
    
    words, tags, rels = self.vocabs
    for i, sent in enumerate(buff):
      for j, token in enumerate(sent):
        word, tag1, tag2, head, rel = (token[words.conll_idx],
                                       token[tags.conll_idx[0]],
                                       token[tags.conll_idx[1]],
                                       token[6], token[rels.conll_idx])
        buff[i][j] = (word,) + words[word] + tags[tag1] + tags[tag2] + (int(head),) + rels[rel]
      sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT))
    return buff
  
  #=============================================================
  def reset(self, sizes):
    """"""
    
    self._data = []
    self._targets = []
    self._metabucket.reset(sizes)
    return
  
  #=============================================================
  def rebucket(self):
    """"""
    
    buff = next(self._file_iterator)
    len_cntr = Counter()
    
    for sent in buff:
      len_cntr[len(sent)] += 1
    self.reset(KMeans(self.n_bkts, len_cntr).splits)
    
    for sent in buff:
      self._metabucket.add(sent)
    self._finalize()
    return
  
  #=============================================================
  def _finalize(self):
    """"""
    
    self._metabucket._finalize()
    return
  
  #=============================================================
  def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True):
    """"""
    
    minibatches = []
    for bkt_idx, bucket in enumerate(self._metabucket):
      if batch_size == 0:
        n_splits = 1
      #elif not self.minimize_pads:
      #  n_splits = max(len(bucket) // batch_size, 1)
      #  if bucket.size > 100:
      #    n_splits *= 2
      else:
        n_tokens = len(bucket) * bucket.size
        n_splits = max(n_tokens // batch_size, 1)
      if shuffle:
        range_func = np.random.permutation
      else:
        range_func = np.arange
      arr_sp = np.array_split(range_func(len(bucket)), n_splits)
      for bkt_mb in arr_sp:
        minibatches.append( (bkt_idx, bkt_mb) )
    if shuffle:
      np.random.shuffle(minibatches)
    for bkt_idx, bkt_mb in minibatches:
      data = self[bkt_idx].data[bkt_mb]
      sents = self[bkt_idx].sents[bkt_mb]
      maxlen = np.max(np.sum(np.greater(data[:,:,0], 0), axis=1))
      feed_dict = {
        self.inputs: data[:,:maxlen,input_idxs],
        self.targets: data[:,:maxlen,target_idxs]
      }
      yield feed_dict, sents
  
  #=============================================================
  def get_minibatches2(self, batch_size, input_idxs, target_idxs):
    """"""
    
    bkt_lens = np.empty(len(self._metabucket))
    for i, bucket in enumerate(self._metabucket):
      bkt_lens[i] = len(bucket)
    
    total_sents = np.sum(bkt_lens)
    bkt_probs = bkt_lens / total_sents
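    # Buckets are sampled with replacement, proportionally to how many
    # sentences each holds, until roughly one epoch's worth of sentences
    # has been drawn.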
    n_sents = 0
    while n_sents < total_sents:
      n_sents += batch_size
      bkt = np.random.choice(self._metabucket._buckets, p=bkt_probs)
      data = bkt.data[np.random.randint(len(bkt), size=batch_size)]
      if bkt.size > 100:
        for data_ in np.array_split(data, 2):
          feed_dict = {
            self.inputs: data_[:,:,input_idxs],
            self.targets: data_[:,:,target_idxs]
          }
          yield feed_dict
      else:
        feed_dict = {
          self.inputs: data[:,:,input_idxs],
          self.targets: data[:,:,target_idxs]
        }
        yield feed_dict
  
  #=============================================================
  @property
  def n_bkts(self):
    if self._train:
      return super(Dataset, self).n_bkts
    else:
      return super(Dataset, self).n_valid_bkts
  
  #=============================================================
  def __getitem__(self, key):
    return self._metabucket[key]
  def __len__(self):
    return len(self._metabucket)
Example #4
class Dataset(Configurable):
    """"""

    #=============================================================
    def __init__(self, filename, vocabs, builder, *args, **kwargs):
        """"""

        super(Dataset, self).__init__(*args, **kwargs)
        self.vocabs = vocabs

        self.train_domains_set = set(
            self.train_domains.split(',')
        ) if self.train_domains != '-' and self.name == "Trainset" else set()
        print("Loading training data from domains:",
              self.train_domains_set if self.train_domains_set else "all")

        self._file_iterator = self.file_iterator(filename)
        self._train = (filename == self.train_file)
        self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
        self._data = None
        self.rebucket()

        self.inputs = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None, None),
                                     name='inputs')
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=(None, None, None),
                                      name='targets')
        self.step = tf.placeholder_with_default(0., shape=None, name='step')
        self.builder = builder()

    #=============================================================
    def file_iterator(self, filename):
        """"""

        with open(filename) as f:
            if self.lines_per_buffer > 0:
                buff = [[]]
                while True:
                    line = f.readline()
                    while line:
                        line = line.strip().split()
                        if line and (not self.train_domains_set
                                     or line[0].split('/')[0]
                                     in self.train_domains_set):
                            buff[-1].append(line)
                        else:
                            if len(buff) < self.lines_per_buffer:
                                if len(buff[-1]) > 0:
                                    buff.append([])
                                else:
                                    buff[-1] = []
                            else:
                                break
                        line = f.readline()
                    if not line:
                        f.seek(0)
                    else:
                        buff = self._process_buff(buff)
                        yield buff
                        line = line.strip().split()
                        if line:
                            buff = [[line]]
                        else:
                            buff = [[]]
            else:
                buff = [[]]
                for line in f:
                    line = line.strip().split()
                    if line and (not self.train_domains_set
                                 or line[0].split('/')[0]
                                 in self.train_domains_set):
                        buff[-1].append(line)
                    else:
                        if len(buff[-1]) > 0:
                            buff.append([])
                        else:
                            buff[-1] = []
                if buff[-1] == []:
                    buff.pop()
                buff = self._process_buff(buff)
                while True:
                    yield buff

    #=============================================================
    def _process_buff(self, buff):
        """"""

        # tmp_f = open("debug_data_%s" % self.name, 'w')

        words, tags, rels, srls, predicates, domains = self.vocabs
        srl_start_field = srls.conll_idx[0]
        sents = 0
        toks = 0
        examples = 0
        total_predicates = 0
        buff2 = []
        for i, sent in enumerate(buff):
            # if not self.conll2012 or (self.conll2012 and len(list(sent)) > 1):
            # print(sent, len(sent))
            sents += 1
            sent_len = len(sent)
            num_fields = len(sent[0])
            srl_take_indices = [
                idx
                for idx in range(srl_start_field, srl_start_field + sent_len)
                if idx < num_fields - 1 and (self.train_on_nested or np.all(
                    ['/' not in sent[j][idx] for j in range(sent_len)]))
            ]
            predicate_indices = []
            for j, token in enumerate(sent):
                toks += 1
                if self.conll:
                    word, tag1, tag2, head, rel = (token[words.conll_idx],
                                                   token[tags.conll_idx[0]],
                                                   token[tags.conll_idx[1]],
                                                   token[6],
                                                   token[rels.conll_idx])
                    if rel == 'root':
                        head = j
                    else:
                        head = int(head) - 1
                    buff[i][j] = ((word, ) + words[word] + tags[tag1] +
                                  tags[tag2] + (head, ) + rels[rel])
                elif self.conll2012:
                    word, auto_tag, gold_tag, head, rel = (
                        token[words.conll_idx], token[tags.conll_idx[0]],
                        token[tags.conll_idx[1]], token[6],
                        token[rels.conll_idx])
                    domain = token[0].split('/')[0]
                    # print(word, tag1, tag2, head, rel)
                    if rel == 'root':
                        head = j
                    else:
                        head = int(head) - 1

                    # srl_fields = [token[idx] if idx < len(token)-1 else 'O' for idx in range(srl_start_field, srl_start_field + sent_len)]
                    # todo: can we use fancy indexing here?
                    srl_fields = [token[idx] for idx in srl_take_indices]
                    srl_fields += ['O'] * (sent_len - len(srl_take_indices))
                    srl_tags = [srls[s][0] for s in srl_fields]

                    if self.joint_pos_predicates:
                        is_predicate = (token[predicates.conll_idx[0]] != '-'
                                        and
                                        (self.train_on_nested
                                         or self.predicate_str in srl_fields))
                        tok_predicate_str = str(is_predicate) + '/' + gold_tag
                    else:
                        is_predicate = (token[predicates.conll_idx] != '-' and
                                        (self.train_on_nested
                                         or self.predicate_str in srl_fields))
                        tok_predicate_str = str(is_predicate)

                    if is_predicate:
                        predicate_indices.append(j)

                    buff[i][j] = ((word, ) + words[word] + tags[auto_tag] +
                                  predicates[tok_predicate_str] +
                                  domains[domain] + (sents, ) +
                                  tags[gold_tag] + (head, ) + rels[rel] +
                                  tuple(srl_tags))

            # Expand sentences into one example per predicate
            if self.one_example_per_predicate:
                # grab the sent
                # should be sent_len x sent_elements
                sent = np.array(buff[i])
                # print(sent)
                is_predicate_idx = 4
                srl_start_idx = 10
                word_part = sent[:, 0].astype('O')
                srl_part = sent[:, srl_start_idx:].astype(np.int32)
                rest_part = sent[:, 1:srl_start_idx].astype(np.int32)
                # print("orig sent (%d):" % len(predicate_indices), sent[:, :8+len(predicate_indices)])
                # print("orig preds:", [map(lambda x: srls[int(x)], t) for t in sent[:, srl_start_idx:srl_start_idx+len(predicate_indices)]])
                if predicate_indices:
                    for k, p_idx in enumerate(predicate_indices):
                        # should be sent_len x sent_elements
                        rest_part[:, is_predicate_idx -
                                  1] = predicates["False"][0]
                        rest_part[p_idx,
                                  is_predicate_idx - 1] = predicates["True"][0]
                        correct_srls = srl_part[:, k]
                        new_sent = np.concatenate([
                            np.expand_dims(word_part, -1), rest_part,
                            np.expand_dims(correct_srls, -1)
                        ],
                                                  axis=1)
                        buff2.append(new_sent)
                        # print("new sent:", new_sent)
                        # print("new preds:", map(lambda x: srls[int(x)], new_sent[:, -1]))
                        # tokens_str = ' '.join(word_part)
                        # labels_str = ' '.join(map(lambda x: srls[x], correct_srls))
                        ## idx, tokens, labels
                        # print("%d %s ||| %s" % (p_idx, tokens_str, labels_str), file=tmp_f)
                        total_predicates += 1
                        examples += 1
                else:
                    new_sent = np.concatenate(
                        [np.expand_dims(word_part, -1), rest_part], axis=1)
                    buff2.append(new_sent)
                    examples += 1
            # else:
            #   buff2.append(np.concatenate[np.expand_dims(word_part, -1), rest_part, srl_part], axis=1) #(sent[0],) + map(int, sent[1:]))
            #   examples += 1
        # tmp_f.close()
        if self.one_example_per_predicate:
            print(
                "Loaded %d sentences with %d tokens, %d examples (%d predicates) (%s)"
                % (sents, toks, examples, total_predicates, self.name))
            return buff2
        else:
            print("Loaded %d sentences with %d tokens (%s)" %
                  (sents, toks, self.name))
            return buff

    #=============================================================
    def reset(self, sizes):
        """"""

        self._data = []
        self._targets = []
        self._metabucket.reset(sizes)
        return

    #=============================================================
    def rebucket(self):
        """"""

        buff = next(self._file_iterator)
        len_cntr = Counter()

        for sent in buff:
            len_cntr[len(sent)] += 1
        self.reset(KMeans(self.n_bkts, len_cntr).splits)

        for sent in buff:
            self._metabucket.add(sent)
        self._finalize()
        return

    #=============================================================
    def _finalize(self):
        """"""

        self._metabucket._finalize()
        return

    #=============================================================
    def get_minibatches(self,
                        batch_size,
                        input_idxs,
                        target_idxs,
                        shuffle=True):
        """"""

        minibatches = []
        for bkt_idx, bucket in enumerate(self._metabucket):
            if batch_size == 0:
                n_splits = 1
            else:
                n_tokens = len(bucket) * bucket.size
                n_splits = max(n_tokens // batch_size, 1)
            if shuffle:
                range_func = np.random.permutation
            else:
                range_func = np.arange
            arr_sp = np.array_split(range_func(len(bucket)), n_splits)
            for bkt_mb in arr_sp:
                minibatches.append((bkt_idx, bkt_mb))
        if shuffle:
            np.random.shuffle(minibatches)
        for bkt_idx, bkt_mb in minibatches:
            feed_dict = {}
            data = self[bkt_idx].data[bkt_mb]
            sents = self[bkt_idx].sents[bkt_mb]
            maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))

            # np.set_printoptions(threshold=np.nan)
            # print("maxlen", maxlen)
            # print("maxlen+max(target_idxs)", maxlen+max(target_idxs))
            # print("data.shape[2]", data.shape[2])
            # targets = data[:,:maxlen,min(target_idxs):maxlen+max(target_idxs)+1]
            # print("data shape", targets.shape)
            # print("data[:,:,3:] shape", targets[:,:,3:].shape)

            feed_dict.update({
                self.inputs:
                data[:, :maxlen, input_idxs],
                self.targets:
                data[:, :maxlen,
                     min(target_idxs):maxlen + max(target_idxs) + 1]
            })
            yield feed_dict, sents

    #=============================================================
    @property
    def n_bkts(self):
        if self._train:
            return super(Dataset, self).n_bkts
        else:
            return super(Dataset, self).n_valid_bkts

    #=============================================================
    def __getitem__(self, key):
        return self._metabucket[key]

    def __len__(self):
        return len(self._metabucket)
Example #5
class Dataset(Configurable):
    """"""

    #=============================================================
    def __init__(self, filename, vocabs, builder, *args, **kwargs):
        """"""

        super(Dataset, self).__init__(*args, **kwargs)
        self._file_iterator = self.file_iterator(filename)
        self._train = (filename == self.train_file)
        self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
        self._data = None
        self.vocabs = vocabs
        self.rebucket()

        self.inputs = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None, None),
                                     name='inputs')
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=(None, None, None),
                                      name='targets')
        self.builder = builder()

    #=============================================================
    def file_iterator(self, filename):
        """"""

        with open(filename) as f:
            if self.lines_per_buffer > 0:
                buff = [[]]
                while True:
                    line = f.readline()
                    while line:
                        line = line.strip().split()
                        if line:
                            buff[-1].append(line)
                        else:
                            if len(buff) < self.lines_per_buffer:
                                if buff[-1]:
                                    buff.append([])
                            else:
                                break
                        line = f.readline()
                    if not line:
                        f.seek(0)
                    else:
                        buff = self._process_buff(buff)
                        yield buff
                        line = line.strip().split()
                        if line:
                            buff = [[line]]
                        else:
                            buff = [[]]
            else:
                buff = [[]]
                for line in f:
                    line = line.strip().split()
                    if line:
                        buff[-1].append(line)
                    else:
                        if buff[-1]:
                            buff.append([])
                if buff[-1] == []:
                    buff.pop()
                buff = self._process_buff(buff)
                while True:
                    yield buff

    #=============================================================
    def _process_buff(self, buff):
        """"""

        words, tags, rels = self.vocabs
        for i, sent in enumerate(buff):
            for j, token in enumerate(sent):
                word, tag1, tag2, head, rel = (token[words.conll_idx],
                                               token[tags.conll_idx[0]],
                                               token[tags.conll_idx[1]],
                                               token[6],
                                               token[rels.conll_idx])
                buff[i][j] = ((word, ) + words[word] + tags[tag1] +
                              tags[tag2] + (int(head), ) + rels[rel])
            sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT,
                            Vocab.ROOT, 0, Vocab.ROOT))
        return buff

    #=============================================================
    def reset(self, sizes):
        """"""

        self._data = []
        self._targets = []
        self._metabucket.reset(sizes)
        return

    #=============================================================
    def rebucket(self):
        """"""

        buff = next(self._file_iterator)
        len_cntr = Counter()

        for sent in buff:
            len_cntr[len(sent)] += 1
        self.reset(KMeans(self.n_bkts, len_cntr).splits)

        for sent in buff:
            self._metabucket.add(sent)
        self._finalize()
        return

    #=============================================================
    def _finalize(self):
        """"""

        self._metabucket._finalize()
        return

    #=============================================================
    def get_minibatches(self,
                        batch_size,
                        input_idxs,
                        target_idxs,
                        shuffle=True):
        """"""

        minibatches = []
        for bkt_idx, bucket in enumerate(self._metabucket):
            if batch_size == 0:
                n_splits = 1
            else:
                n_tokens = len(bucket) * bucket.size
                n_splits = max(n_tokens // batch_size, 1)
            if shuffle:
                range_func = np.random.permutation
            else:
                range_func = np.arange
            arr_sp = np.array_split(range_func(len(bucket)), n_splits)
            for bkt_mb in arr_sp:
                minibatches.append((bkt_idx, bkt_mb))
        if shuffle:
            np.random.shuffle(minibatches)
        for bkt_idx, bkt_mb in minibatches:
            feed_dict = {}
            data = self[bkt_idx].data[bkt_mb]
            sents = self[bkt_idx].sents[bkt_mb]
            maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
            feed_dict.update({
                self.inputs: data[:, :maxlen, input_idxs],
                self.targets: data[:, :maxlen, target_idxs]
            })
            yield feed_dict, sents

    def get_minibatches_gemb_train(self,
                                   batch_size,
                                   input_idxs,
                                   target_idxs,
                                   shuffle=True):
        """"""

        minibatches = []
        for bkt_idx, bucket in enumerate(self._metabucket):
            if batch_size == 0:
                n_splits = 1
            else:
                n_tokens = len(bucket) * bucket.size
                n_splits = max(n_tokens // batch_size, 1)
            if shuffle:
                range_func = np.random.permutation
            else:
                range_func = np.arange
            arr_sp = np.array_split(range_func(len(bucket)), n_splits)
            for bkt_mb in arr_sp:
                minibatches.append((bkt_idx, bkt_mb))
        if shuffle:
            np.random.shuffle(minibatches)
        for bkt_idx, bkt_mb in minibatches:
            feed_dict = {}
            data = self[bkt_idx].data[bkt_mb]
            sents = self[bkt_idx].sents[bkt_mb]
            unk_id = self.vocabs[0]._str2idx['<UNK>']

            #no_oov = [i for i,s in enumerate(data[:,:,0]) if unk_id not in s]
            no_oov = list(range(len(sents)))

            # check for empty batch
            if len(no_oov) == 0:
                continue

            data_no_oov = data[no_oov]
            maxlen = np.max(np.sum(np.greater(data_no_oov[:, :, 0], 0),
                                   axis=1))
            if maxlen <= 2:
                continue
            oov_pos = np.random.randint(1, maxlen - 1,
                                        size=len(no_oov))  # one oov each sent

            feed_dict.update({
                self.inputs: data_no_oov[:, :maxlen, input_idxs],
                self.targets: data_no_oov[:, :maxlen, target_idxs]
            })
            yield feed_dict, oov_pos, sents[no_oov]

    def get_minibatches_gemb_test(self,
                                  batch_size,
                                  input_idxs,
                                  target_idxs,
                                  shuffle=True):
        """"""
        batch_size = 1  # must be 1 during testing
        minibatches = []
        for bkt_idx, bucket in enumerate(self._metabucket):
            if batch_size == 0:
                n_splits = 1
            else:
                n_tokens = len(bucket) * bucket.size
                n_splits = max(n_tokens // batch_size, 1)
            if shuffle:
                range_func = np.random.permutation
            else:
                range_func = np.arange
            arr_sp = np.array_split(range_func(len(bucket)), n_splits)
            for bkt_mb in arr_sp:
                minibatches.append((bkt_idx, bkt_mb))
        if shuffle:
            np.random.shuffle(minibatches)
        for bkt_idx, bkt_mb in minibatches:
            feed_dict = {}
            data = self[bkt_idx].data[bkt_mb]
            sents = self[bkt_idx].sents[bkt_mb]
            maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
            unk_id = self.vocabs[0]._str2idx['<UNK>']

            # Flatten per-sentence OOV positions into flat indices; since
            # batch_size is forced to 1 here, every row has the same number
            # of OOV tokens and the row offsets line up.
            oov_pos = np.array([np.where(s == unk_id)[0] for s in sents])
            oov_pos = np.squeeze(oov_pos +
                                 np.reshape(oov_pos.shape[1] *
                                            np.arange(oov_pos.shape[0]),
                                            [-1, 1]))

            feed_dict.update({
                self.inputs: data[:, :maxlen, input_idxs],
                self.targets: data[:, :maxlen, target_idxs]
            })
            yield feed_dict, oov_pos, sents

    #=============================================================
    @property
    def n_bkts(self):
        if self._train:
            return super(Dataset, self).n_bkts
        else:
            return super(Dataset, self).n_valid_bkts

    #=============================================================
    def __getitem__(self, key):
        return self._metabucket[key]

    def __len__(self):
        return len(self._metabucket)
Example #6
class Dataset(Configurable):
    """"""

    #=============================================================
    def __init__(self, filename, vocabs, builder, *args, **kwargs):
        """"""

        super(Dataset, self).__init__(*args, **kwargs)
        self._file_iterator = self.file_iterator(filename)
        self._train = (filename == self.train_file)
        self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
        self._data = None
        self.vocabs = vocabs
        self.rebucket()

        self.inputs = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None, None),
                                     name='inputs')
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=(None, None),
                                      name='targets')
        self.sntmod = tf.placeholder(dtype=tf.float32,
                                     shape=(None, 3),
                                     name='sntmod')
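        # sntmod holds one 3-dimensional float vector per sentence; it is fed
        # from the bucket's smod array in get_minibatches() below.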

        self.builder = builder()

    #=============================================================
    def file_iterator(self, filename):
        """"""

        with open(filename) as f:
            if self.lines_per_buffer > 0:
                buff = [[]]
                while True:
                    line = f.readline()
                    while line:
                        line = line.strip().split()
                        if line:
                            buff[-1].append(line)
                        else:
                            if len(buff) < self.lines_per_buffer:
                                if buff[-1]:
                                    buff.append([])
                            else:
                                break
                        line = f.readline()
                    if not line:
                        f.seek(0)
                    else:
                        buff = self._process_buff(buff)
                        yield buff
                        line = line.strip().split()
                        if line:
                            buff = [[line]]
                        else:
                            buff = [[]]
            else:
                buff = [[]]
                for line in f:
                    line = line.strip().split()
                    if line:
                        buff[-1].append(line)
                    else:
                        if buff[-1]:
                            buff.append([])
                if buff[-1] == []:
                    buff.pop()
                buff = self._process_buff(buff)
                while True:
                    yield buff

    #=============================================================
    def _process_buff(self, buff):
        """"""

        words, tags = self.vocabs
        for i, sent in enumerate(buff):
            targetflag = 0
            for j, token in enumerate(sent):
                if token[2] != 'o':
                    targetflag = 1
                word = token[0]
                tag = token[1]
                istarget = 0 if token[2] == 'o' else 1
                bftarget = 1 if token[2] == 'o' and targetflag == 0 else 0
                aftarget = 1 if token[2] == 'o' and targetflag == 1 else 0
                sentmod = self.getmood(token[2])
                buff[i][j] = ((word, ) + words[word] + tags[tag] +
                              (int(istarget), ) + (int(bftarget), ) +
                              (int(aftarget), ) + (sentmod, ))
        return buff

    #=============================================================
    def getmood(self, polarity):
        """"""
        if polarity == 'o':
            return 0
        else:
            polarity = polarity.split('-')[1]
            if polarity == 'positive':
                return 2
            elif polarity == 'negative':
                return 4
            else:
                return 6

    #=============================================================
    def reset(self, sizes):
        """"""

        self._data = []
        self._targets = []
        self._metabucket.reset(sizes)
        return

    #=============================================================
    def rebucket(self):
        """"""

        buff = next(self._file_iterator)
        len_cntr = Counter()

        for sent in buff:
            len_cntr[len(sent)] += 1
        self.reset(KMeans(self.n_bkts, len_cntr).splits)

        for sent in buff:
            self._metabucket.add(sent)
        self._finalize()
        return

    #=============================================================
    def _finalize(self):
        """"""

        self._metabucket._finalize()
        return

    #=============================================================
    def get_minibatches(self,
                        batch_size,
                        input_idxs,
                        target_idxs,
                        shuffle=True):
        """"""

        minibatches = []
        for bkt_idx, bucket in enumerate(self._metabucket):
            if batch_size == 0:
                n_splits = 1
            #elif not self.minimize_pads:
            #  n_splits = max(len(bucket) // batch_size, 1)
            #  if bucket.size > 100:
            #    n_splits *= 2
            else:
                n_tokens = len(bucket) * bucket.size
                n_splits = max(n_tokens // batch_size, 1)
            if shuffle:
                range_func = np.random.permutation
            else:
                range_func = np.arange
            arr_sp = np.array_split(range_func(len(bucket)), n_splits)
            for bkt_mb in arr_sp:
                if len(bkt_mb) > 0:
                    minibatches.append((bkt_idx, bkt_mb))
        if shuffle:
            np.random.shuffle(minibatches)
        for bkt_idx, bkt_mb in minibatches:
            data = self[bkt_idx].data[bkt_mb]
            sents = self[bkt_idx].sents[bkt_mb]
            sntmodp = self[bkt_idx].smod[bkt_mb]
            maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
            # print("[tlog] maxlen\n"+str(maxlen))

            feed_dict = {
                self.inputs: data[:, :maxlen, input_idxs],
                self.targets: data[:, :maxlen, target_idxs],
                self.sntmod: sntmodp
            }
            yield feed_dict, sents

    #=============================================================
    @property
    def n_bkts(self):
        if self._train:
            return super(Dataset, self).n_bkts
        else:
            return super(Dataset, self).n_valid_bkts

    #=============================================================
    def __getitem__(self, key):
        return self._metabucket[key]

    def __len__(self):
        return len(self._metabucket)
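
The per-token tuple that Example #6's _process_buff builds can be hard to read in context, so here is a minimal, self-contained sketch of the same flag-and-polarity encoding using a toy sentence. The token format (word, POS tag, polarity label such as 'b-positive' or 'o') and the sample words are assumptions for illustration, and the vocab index lookups (words[word], tags[tag]) are omitted.

# Standalone sketch of the encoding in _process_buff/getmood above.
def getmood(polarity):
    if polarity == 'o':
        return 0
    polarity = polarity.split('-')[1]
    if polarity == 'positive':
        return 2
    elif polarity == 'negative':
        return 4
    return 6

def encode(sent):
    targetflag = 0
    rows = []
    for word, tag, polarity in sent:
        if polarity != 'o':
            targetflag = 1
        istarget = 0 if polarity == 'o' else 1
        bftarget = 1 if polarity == 'o' and targetflag == 0 else 0
        aftarget = 1 if polarity == 'o' and targetflag == 1 else 0
        rows.append((word, tag, istarget, bftarget, aftarget, getmood(polarity)))
    return rows

toy_sent = [('the', 'DT', 'o'), ('pizza', 'NN', 'b-positive'), ('arrived', 'VBD', 'o')]
for row in encode(toy_sent):
    print(row)
# ('the', 'DT', 0, 1, 0, 0)       -> context before the target
# ('pizza', 'NN', 1, 0, 0, 2)     -> target token, positive polarity
# ('arrived', 'VBD', 0, 0, 1, 0)  -> context after the target
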
Example #7
class Dataset(Configurable):
  """"""
  
  #=============================================================
  def __init__(self, filename, vocabs, builder, *args, **kwargs):
    """"""
    
    super(Dataset, self).__init__(*args, **kwargs)
    self._file_iterator = self.file_iterator(filename)
    self._train = (filename == self.train_file)
    self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
    self._data = None
    self.vocabs = vocabs
    self.rebucket()
    
    self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs')
    self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets')
    self.builder = builder()
  
  #=============================================================
  def file_iterator(self, filename):
    """"""
    
    with open(filename) as f:
      if self.lines_per_buffer > 0:
        buff = [[]]
        while True:
          line = f.readline()
          while line:
            line = line.strip().split()
            if line and line[0] != '#':
              buff[-1].append(line)
            else:
              if len(buff) < self.lines_per_buffer:
                if buff[-1]:
                  buff.append([])
              else:
                break
            line = f.readline()
          if not line:
            f.seek(0)
          else:
            buff = self._process_buff(buff)
            yield buff
            line = line.strip().split()
            if line and line[0] != '#':
              buff = [[line]]
            else:
              buff = [[]]
      else:
        buff = [[]]
        for line in f:
          line = line.strip().split()
          if line and line[0] != '#':
            buff[-1].append(line)
          else:
            if buff[-1]:
              buff.append([])
        if buff[-1] == []:
          buff.pop()
        buff = self._process_buff(buff)
        while True:
          yield buff
  
  #=============================================================
  def _process_buff(self, buff):
    """"""
    
    words, tags, rels = self.vocabs
    for i, sent in enumerate(buff):
      new_sent = [None]*(len(sent)+1)
      for j, token in enumerate(sent):
        try:
          word, tag1, tag2, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx]
          if head.isdigit():
            new_sent[int(token[0])] = (word,) + words[word] + tags[tag1] + tags[tag2] + (int(head),) + rels[rel]
        except Exception as e:
          print(token)
          raise e
      new_sent[0] = ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT)
      buff[i] = [x for x in new_sent if x is not None]
    return buff
  
  #=============================================================
  def reset(self, sizes):
    """"""
    
    self._data = []
    self._targets = []
    self._metabucket.reset(sizes)
    return
  
  #=============================================================
  def rebucket(self):
    """"""
    
    buff = next(self._file_iterator)
    len_cntr = Counter()
    
    for sent in buff:
      len_cntr[len(sent)] += 1
    self.reset(KMeans(self.n_bkts, len_cntr).splits)
    
    for sent in buff:
      self._metabucket.add(sent)
    self._finalize()
    return
  
  #=============================================================
  def _finalize(self):
    """"""
    
    self._metabucket._finalize()
    return
  
  #=============================================================
  def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True):
    """"""
    
    minibatches = []
    for bkt_idx, bucket in enumerate(self._metabucket):
      if batch_size == 0:
        n_splits = 1
      else:
        n_tokens = len(bucket) * bucket.size
        n_splits = max(n_tokens // batch_size, 1)
      if shuffle:
        range_func = np.random.permutation
      else:
        range_func = np.arange
      arr_sp = np.array_split(range_func(len(bucket)), n_splits)
      for bkt_mb in arr_sp:
        minibatches.append( (bkt_idx, bkt_mb) )
    if shuffle:
      np.random.shuffle(minibatches)
    for bkt_idx, bkt_mb in minibatches:
      feed_dict = {}
      data = self[bkt_idx].data[bkt_mb]
      sents = self[bkt_idx].sents[bkt_mb]
      maxlen = np.max(np.sum(np.greater(data[:,:,0], 0), axis=1))
      feed_dict.update({
        self.inputs: data[:,:maxlen,input_idxs],
        self.targets: data[:,:maxlen,target_idxs]
      })
      yield feed_dict, sents
  
  #=============================================================
  @property
  def n_bkts(self):
    if self._train:
      return super(Dataset, self).n_bkts
    else:
      return super(Dataset, self).n_valid_bkts
  
  #=============================================================
  def __getitem__(self, key):
    return self._metabucket[key]
  def __len__(self):
    return len(self._metabucket)
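
Example #7's get_minibatches sizes each split by a token budget (sentences in a bucket times the bucket's padded length), which is easy to miss in the loop above. The sketch below reproduces just that splitting logic with plain NumPy; the bucket sizes and the batch_size value are made up for illustration, and the Metabucket/feed_dict machinery is deliberately left out.

import numpy as np

def bucket_minibatches(bucket_counts, bucket_widths, batch_size, shuffle=True):
    """bucket_counts[i]: sentences in bucket i; bucket_widths[i]: padded length."""
    minibatches = []
    for bkt_idx, (n_sents, width) in enumerate(zip(bucket_counts, bucket_widths)):
        # batch_size is a token budget, so wider buckets are cut into more
        # (and therefore smaller) chunks of sentence indices
        n_tokens = n_sents * width
        n_splits = max(n_tokens // batch_size, 1) if batch_size else 1
        order = np.random.permutation(n_sents) if shuffle else np.arange(n_sents)
        for mb in np.array_split(order, n_splits):
            if len(mb) > 0:
                minibatches.append((bkt_idx, mb))
    if shuffle:
        np.random.shuffle(minibatches)
    return minibatches

# e.g. two buckets: 50 sentences padded to length 20, 10 padded to length 60
for bkt_idx, mb in bucket_minibatches([50, 10], [20, 60], batch_size=500):
    # in the class above this is where data[bkt_idx][mb] would be sliced
    # and fed to self.inputs / self.targets
    pass
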
Example #8
class Dataset(Configurable):
  """"""
  
  #=============================================================
  def __init__(self, filename, vocabs, builder, *args, **kwargs):
    """"""
    
    super(Dataset, self).__init__(*args, **kwargs)
    self.vocabs = vocabs

    self.train_domains_set = set(self.train_domains.split(',')) if self.train_domains != '-' and self.name == "Trainset" else set()
    print("Loading training data from domains:", self.train_domains_set if self.train_domains_set else "all")

    self._file_iterator = self.file_iterator(filename)
    self._train = (filename == self.train_file)
    self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
    self._data = None
    self.rebucket()

    self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs')
    self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets')
    self.builder = builder()
  
  #=============================================================
  def file_iterator(self, filename):
    """"""
    
    with open(filename) as f:
      if self.lines_per_buffer > 0:
        buff = [[]]
        while True:
          line = f.readline()
          while line:
            line = line.strip().split()
            if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains):
              buff[-1].append(line)
            else:
              if len(buff) < self.lines_per_buffer:
                if len(buff[-1]) > 0:
                  buff.append([])
                else:
                  buff[-1] = []
              else:
                break
            line = f.readline()
          if not line:
            f.seek(0)
          else:
            buff = self._process_buff(buff)
            yield buff
            line = line.strip().split()
            if line:
              buff = [[line]]
            else:
              buff = [[]]
      else:
        buff = [[]]
        for line in f:
          line = line.strip().split()
          if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains):
            buff[-1].append(line)
          else:
            if len(buff[-1]) > 0:
              buff.append([])
            else:
              buff[-1] = []
        if buff[-1] == []:
          buff.pop()
        buff = self._process_buff(buff)
        while True:
          yield buff
  
  #=============================================================
  def _process_buff(self, buff):
    """"""
    
    words, tags, rels, srls, trigs, domains = self.vocabs
    srl_start_field = srls.conll_idx[0]
    sents = 0
    toks = 0
    for i, sent in enumerate(buff):
      # if not self.conll2012 or (self.conll2012 and len(list(sent)) > 1):
      # print(sent, len(sent))
      sents += 1
      sent_len = len(sent)
      num_fields = len(sent[0])
      # Columns holding SRL tags for this sentence; nested ('/'-joined)
      # frames are skipped unless train_on_nested is set.
      srl_take_indices = [
          idx for idx in range(srl_start_field, srl_start_field + sent_len)
          if idx < num_fields - 1 and
          (self.train_on_nested or
           np.all(['/' not in sent[j][idx] for j in range(sent_len)]))
      ]
      for j, token in enumerate(sent):
        toks += 1
        if self.conll:
          word, tag1, tag2, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx]
          if rel == 'root':
            head = j
          else:
            head = int(head) - 1
          buff[i][j] = (word,) + words[word] + tags[tag1] + tags[tag2] + (head,) + rels[rel]
        elif self.conll2012:
          word, auto_tag, gold_tag, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx]
          domain = token[0].split('/')[0]
          # print(word, tag1, tag2, head, rel)
          if rel == 'root':
            head = j
          else:
            head = int(head) - 1

          # srl_fields = [token[idx] if idx < len(token)-1 else 'O' for idx in range(srl_start_field, srl_start_field + sent_len)]
          srl_fields = [token[idx] for idx in srl_take_indices] # todo can we use fancy indexing here?
          srl_fields += ['O'] * (sent_len - len(srl_take_indices)) #np.any([s in self.trigger_indices for s in srl_tags])
          srl_tags = [srls[s][0] for s in srl_fields]

          if self.joint_pos_predicates:
            is_trigger = token[trigs.conll_idx[0]] != '-' and (self.train_on_nested or self.trigger_str in srl_fields)
            trigger_str = str(is_trigger) + '/' + gold_tag
          else:
            is_trigger = token[trigs.conll_idx] != '-' and (self.train_on_nested or self.trigger_str in srl_fields)
            trigger_str = str(is_trigger)

          buff[i][j] = (word,) + words[word] + tags[auto_tag] + trigs[trigger_str] + domains[domain] + tags[gold_tag] + (head,) + rels[rel] + tuple(srl_tags)
        # sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT))
    print("Loaded %d sentences with %d tokens (%s)" % (sents, toks, self.name))
    return buff
  
  #=============================================================
  def reset(self, sizes):
    """"""
    
    self._data = []
    self._targets = []
    self._metabucket.reset(sizes)
    return
  
  #=============================================================
  def rebucket(self):
    """"""

    buff = next(self._file_iterator)
    len_cntr = Counter()
    
    for sent in buff:
      len_cntr[len(sent)] += 1
    self.reset(KMeans(self.n_bkts, len_cntr).splits)
    
    for sent in buff:
      self._metabucket.add(sent)
    self._finalize()
    return
  
  #=============================================================
  def _finalize(self):
    """"""
    
    self._metabucket._finalize()
    return
  
  #=============================================================
  def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True):
    """"""
    
    minibatches = []
    for bkt_idx, bucket in enumerate(self._metabucket):
      if batch_size == 0:
        n_splits = 1
      else:
        n_tokens = len(bucket) * bucket.size
        n_splits = max(n_tokens // batch_size, 1)
      if shuffle:
        range_func = np.random.permutation
      else:
        range_func = np.arange
      arr_sp = np.array_split(range_func(len(bucket)), n_splits)
      for bkt_mb in arr_sp:
        minibatches.append( (bkt_idx, bkt_mb) )
    if shuffle:
      np.random.shuffle(minibatches)
    for bkt_idx, bkt_mb in minibatches:
      feed_dict = {}
      data = self[bkt_idx].data[bkt_mb]
      sents = self[bkt_idx].sents[bkt_mb]
      maxlen = np.max(np.sum(np.greater(data[:,:,0], 0), axis=1))
      # newer NumPy rejects np.nan as a threshold; np.inf keeps the intended
      # "print full arrays" behaviour for the debug output below
      np.set_printoptions(threshold=np.inf)

      # print("maxlen", maxlen)
      # print("maxlen+max(target_idxs)", maxlen+max(target_idxs))
      # print("data.shape[2]", data.shape[2])
      # targets = data[:,:maxlen,min(target_idxs):maxlen+max(target_idxs)+1]
      # print("data shape", targets.shape)
      # print("data[:,:,3:] shape", targets[:,:,3:].shape)

      feed_dict.update({
        self.inputs: data[:,:maxlen,input_idxs],
        self.targets: data[:,:maxlen,min(target_idxs):maxlen+max(target_idxs)+1]
      })
      yield feed_dict, sents
  
  #=============================================================
  @property
  def n_bkts(self):
    if self._train:
      return super(Dataset, self).n_bkts
    else:
      return super(Dataset, self).n_valid_bkts
  
  #=============================================================
  def __getitem__(self, key):
    return self._metabucket[key]
  def __len__(self):
    return len(self._metabucket)
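
All three examples share the same file_iterator pattern: whitespace-split token rows grouped into sentences at blank lines, with Example #8 additionally filtering rows by the domain prefix of the first column. Below is a minimal, self-contained version of that grouping/filtering step as a plain function; the sample rows and domain names ('nw', 'bc') are invented for illustration, and the buffering/seek(0) replay logic is omitted.

def read_sentences(lines, train_domains=None):
    """Group whitespace-split rows into sentences separated by blank lines,
    optionally keeping only rows whose first field starts with a kept domain."""
    buff = [[]]
    for line in lines:
        line = line.strip().split()
        if line and (not train_domains or line[0].split('/')[0] in train_domains):
            buff[-1].append(line)
        elif buff[-1]:
            buff.append([])   # a blank or filtered row closes the current sentence
    if buff and not buff[-1]:
        buff.pop()
    return buff

rows = [
    "nw/wsj/0001 The DT",
    "nw/wsj/0001 cat NN",
    "",
    "bc/cnn/0002 Hello UH",
]
print(len(read_sentences(rows)))           # 2 sentences
print(len(read_sentences(rows, {'nw'})))   # 1 sentence (only the 'nw' block)
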