Example #1
import data_converters  # assumed: project-local module providing get_params() and dispatch()


def conll_data_generator(filenames, data_config):
  for filename in filenames:
    with open(filename, 'r') as f:
      sents = 0
      toks = 0
      buf = []
      for line in f:
        line = line.strip()
        if line:
          toks += 1
          split_line = line.split()
          data_vals = []
          for d in data_config.keys():
            # only return the data that we're actually going to use as inputs or outputs
            if ('feature' in data_config[d] and data_config[d]['feature']) or \
               ('label' in data_config[d] and data_config[d]['label']):
              datum_idx = data_config[d]['conll_idx']
              converter_name = data_config[d]['converter']['name'] if 'converter' in data_config[d] else 'default_converter'
              converter_params = data_converters.get_params(data_config[d], split_line, datum_idx)
              data = data_converters.dispatch(converter_name)(**converter_params)
              data_vals.extend(data)
          buf.append(tuple(data_vals))
        else:
          if buf:
            sents += 1
            yield buf
            buf = []
      # catch the last one
      if buf:
        yield buf
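
A minimal usage sketch, assuming a hypothetical data_config in which each entry names a CoNLL column index and flags it as a feature or a label; the keys, column indices, and filename below are illustrative, not taken from the original source:

# Hypothetical config: word form from column 1 as a feature,
# gold POS tag from column 4 as a label.
data_config = {
  'word': {'conll_idx': 1, 'feature': True},
  'gold_pos': {'conll_idx': 4, 'label': True},
}

for sentence in conll_data_generator(['train.conll'], data_config):
  # each sentence is a list of per-token tuples, one value per selected column
  print(len(sentence), sentence[0])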
Example #2
import tensorflow as tf  # TF 1.x, for the tf.logging API used below

import data_converters  # assumed: project-local converter module


def conll_data_generator(filenames, data_config):
    """
    Read CoNLL formated @filenames files. Yields each sentence.
    Select columns defined in @data_config. data_config optionaly specify
    converters.
    """
    for filename in filenames:
        with open(filename, 'r') as f:
            sents = 0
            toks = 0
            buf = []
            for line in f:
                line = line.strip()
                if line:
                    toks += 1
                    split_line = line.split()
                    data_vals = []
                    for d in data_config.keys():
                        # only return the data that we're actually going to use as inputs or outputs
                        if (('feature' in data_config[d] and data_config[d]['feature'])
                                or ('label' in data_config[d] and data_config[d]['label'])):
                            datum_idx = data_config[d]['conll_idx']
                            converter_name = (
                                data_config[d]['converter']['name']
                                if 'converter' in data_config[d]
                                else 'default_converter')
                            converter_params = data_converters.get_params(
                                data_config[d], split_line, datum_idx)
                            tf.logging.log(
                                tf.logging.INFO,
                                f"conll_data_generator dispatching for {d}: "
                                f"{converter_name}, "
                                f"{converter_params}")
                            data = data_converters.dispatch(converter_name)(
                                **converter_params)
                            data_vals.extend(data)
                    buf.append(tuple(data_vals))
                else:
                    if buf:
                        sents += 1
                        tf.logging.log(
                            tf.logging.INFO,
                            f"data_generator.conll_data_generator "
                            f"yielding buf: {buf}: ")
                        yield buf
                        buf = []
            # catch the last one
            if buf:
                tf.logging.log(
                    tf.logging.INFO, f"data_generator.conll_data_generator "
                    f"yielding last buf from {filename}: {buf}: ")
                yield buf
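
Because this variant logs every converter dispatch and every yielded buffer at INFO level through the TF 1.x tf.logging API, the messages only appear once logging verbosity is raised; a minimal sketch:

import tensorflow as tf  # TF 1.x, where tf.logging is available

tf.logging.set_verbosity(tf.logging.INFO)  # surface the INFO-level logs
for sentence in conll_data_generator(filenames, data_config):
    pass  # consuming the generator now logs each dispatch and yield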
Example #3
import data_converters  # assumed: same project-local converter module as above


def conll_data_generator(filenames, data_config):
  for filename in filenames:
    with open(filename, 'r') as f:
      sents = 0
      toks = 0
      buf = []
      for line in f:
        line = line.strip()
        # print("debug <input line>: ", line)
        if line:
          toks += 1
          split_line = line.split()
          data_vals = []
          for d in data_config.keys():
            # only return the data that we're actually going to use as inputs or outputs
            if ('feature' in data_config[d] and data_config[d]['feature']) or \
               ('label' in data_config[d] and data_config[d]['label']):
              datum_idx = data_config[d]['conll_idx']
              converter_name = data_config[d]['converter']['name'] if 'converter' in data_config[d] else 'default_converter'
              converter_params = data_converters.get_params(data_config[d], split_line, datum_idx)
              data = data_converters.dispatch(converter_name)(**converter_params)
              data_vals.extend(data)
          buf.append(tuple(data_vals))

        else:
          if buf:
            sents += 1
            yield buf
            buf = []
      # catch the last one
      if buf:
        yield buf
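
All of these variants rely on the CoNLL convention that a token is a line of whitespace-separated columns and sentences are separated by blank lines; a tiny illustrative input file (the column layout here is hypothetical, real corpora define their own):

# Write a two-sentence CoNLL-style file with id/form/POS columns (illustrative).
sample = (
  "1 The DT\n"
  "2 cat NN\n"
  "\n"
  "1 Meow VB\n"
)
with open('tiny.conll', 'w') as f:
  f.write(sample)
# The generator then yields two buffers: one with two token tuples, one with one.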
Example #4
    # Assumes the enclosing module imports numpy as np and the project-local
    # data_converters and constants modules.
    def create_load_or_update_vocab_files(self,
                                          data_config,
                                          save_dir,
                                          filenames=None,
                                          update_only=False):

        # init maps
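        # vocabs[i] holds the counts map for the i-th vocab; vocabs_index maps
        # each config key to its position in vocabs.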
        vocabs = []
        vocabs_index = {}
        for d in data_config:
            updatable = 'updatable' in data_config[d] and data_config[d]['updatable']
            if 'vocab' in data_config[d] and data_config[d]['vocab'] == d and (
                    updatable or not update_only):
                this_vocab = {}
                if update_only and updatable and d in self.vocab_maps:
                    this_vocab = self.vocab_maps[d]
                vocabs.append(this_vocab)
                vocabs_index[d] = len(vocabs_index)

        # Create vocabs from data files
        if filenames:
            for filename in filenames:
                with open(filename, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            split_line = line.split()
                            for d in vocabs_index.keys():
                                datum_idx = data_config[d]['conll_idx']
                                this_vocab_map = vocabs[vocabs_index[d]]
                                converter_name = (
                                    data_config[d]['converter']['name']
                                    if 'converter' in data_config[d]
                                    else 'default_converter')
                                converter_params = data_converters.get_params(
                                    data_config[d], split_line, datum_idx)
                                this_data = data_converters.dispatch(
                                    converter_name)(**converter_params)
                                for this_datum in this_data:
                                    if this_datum not in this_vocab_map:
                                        this_vocab_map[this_datum] = 0
                                    this_vocab_map[this_datum] += 1

        # Assume we have the vocabs saved to disk; load them
        else:
            for d in vocabs_index.keys():
                this_vocab_map = vocabs[vocabs_index[d]]
                with open("%s/%s.txt" % (save_dir, d), 'r') as f:
                    for line in f:
                        datum, count = line.strip().split()
                        this_vocab_map[datum] = int(count)

        # build reverse_maps, joint_label_lookup_maps
        for v in vocabs_index.keys():

            # build reverse_lookup map, from int -> string
            this_counts_map = vocabs[vocabs_index[v]]
            this_map = {k: i for i, k in enumerate(this_counts_map)}
            reverse_map = {i: k for i, k in enumerate(this_counts_map)}
            self.oovs[v] = False
            if 'oov' in self.data_config[v] and self.data_config[v]['oov']:
                self.oovs[v] = True
                # reverse_map[len(reverse_map)] = constants.OOV_STRING
                # this_map[len(this_map)] = constants.OOV_STRING
            self.reverse_maps[v] = reverse_map
            self.vocab_maps[v] = this_map

            # check whether we need to build joint_label_lookup_map
            if 'label_components' in self.data_config[v]:
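                # Build maps from each joint label's id to the ids of its
                # component labels: a joint label is split on
                # constants.JOINT_LABEL_SEP and each piece is looked up in the
                # corresponding component vocab, filling one int32 column
                # vector per component.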
                joint_vocab_map = vocabs[vocabs_index[v]]
                label_components = self.data_config[v]['label_components']
                component_keys = [
                    vocabs[vocabs_index[d]].keys() for d in label_components
                ]
                component_maps = [
                    dict(zip(comp_keys, range(len(comp_keys))))
                    for comp_keys in component_keys
                ]
                map_names = [
                    "%s_to_%s" % (v, label_comp)
                    for label_comp in label_components
                ]
                joint_to_comp_maps = [
                    np.zeros([len(joint_vocab_map), 1], dtype=np.int32)
                    for _ in label_components
                ]
                for joint_idx, joint_label in enumerate(
                        joint_vocab_map.keys()):
                    split_label = joint_label.split(constants.JOINT_LABEL_SEP)
                    for label_comp, comp_map, joint_to_comp_map in zip(
                            split_label, component_maps, joint_to_comp_maps):
                        comp_idx = comp_map[label_comp]
                        joint_to_comp_map[joint_idx] = comp_idx

                # add them to the master map
                for map_name, joint_to_comp_map in zip(map_names,
                                                       joint_to_comp_maps):
                    self.joint_label_lookup_maps[map_name] = joint_to_comp_map

        for d in vocabs_index.keys():
            this_vocab_map = vocabs[vocabs_index[d]]
            with open("%s/%s.txt" % (save_dir, d), 'w') as f:
                for k, v in this_vocab_map.items():
                    print("%s\t%d" % (k, v), file=f)

        return {k: len(vocabs[vocabs_index[k]]) for k in vocabs_index.keys()}
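
A sketch of how this method might be called, assuming it lives on a vocab-holding class with the data_config, vocab_maps, reverse_maps, oovs, and joint_label_lookup_maps attributes used above; the instance name, directory, and filenames are hypothetical:

# Build vocabs from training data; writes one "<key>.txt" counts file per
# vocab into save_dir and returns {vocab_name: vocab_size}.
vocab_sizes = vocab.create_load_or_update_vocab_files(
    data_config, save_dir='vocab', filenames=['train.conll'])

# With filenames=None, the same counts files are loaded back from save_dir.
vocab.create_load_or_update_vocab_files(data_config, save_dir='vocab')

# With update_only=True, only vocabs marked 'updatable' in data_config are
# extended with counts from the new files.
vocab.create_load_or_update_vocab_files(
    data_config, save_dir='vocab', filenames=['dev.conll'], update_only=True)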