Example #1
def preprocess(config):
    import json
    import os
    from random import random

    import tensorflow as tf
    from tensorflow import python_io as pio
    from tqdm import tqdm

    data_dir = config.pretrain_data_dir
    word2id = json.load(open(os.path.join(data_dir, 'word2id.json')))
    slen = config.toks_per_sent

    numval, numtrain = 0, 0

    with pio.TFRecordWriter(os.path.join(data_dir, 'train.tfrecord'),
                            options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as twriter, \
      pio.TFRecordWriter(os.path.join(data_dir, 'val.tfrecord'),
                         options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as vwriter:
        jdata = json.load(open(os.path.join(data_dir, 'data.json')))
        for example in tqdm(jdata, total=len(jdata)):
            tokens = [word2id['[CLS]']] + example['s1']['tokens'] + [word2id['[SEP]']] \
                + example['s2']['tokens'] + [word2id['[SEP]']]

            s1_len = len(example['s1']['tokens'])
            s2_len = len(example['s2']['tokens'])
            s1_labels, s1_mask = _labels_and_mask(example['s1']['labels'],
                                                  s1_len)
            s2_labels, s2_mask = _labels_and_mask(example['s2']['labels'],
                                                  s2_len)
            a_mask = [0.] + [1.] * s1_len + [0.] * (2 + s2_len)
            b_mask = [0.] * (2 + s1_len) + [1.] * s2_len + [0.]
            lm_labels = [0] + s1_labels + [0] + s2_labels + [0]
            lm_mask = [0.] + s1_mask + [0.] + s2_mask + [0.]

            features = {
                'label': _int64_feature([example['label']]),
                'tokens': _padded_int64_feature(tokens, slen),
                'a_mask': _padded_float_feature(a_mask, slen),
                'b_mask': _padded_float_feature(b_mask, slen),
                'lm_labels': _padded_int64_feature(lm_labels, slen),
                'lm_mask': _padded_float_feature(lm_mask, slen)
            }

            if random() > 0.05:
                numtrain += 1
                twriter.write(
                    tf.train.Example(features=tf.train.Features(
                        feature=features)).SerializeToString())
            else:
                numval += 1
                vwriter.write(
                    tf.train.Example(features=tf.train.Features(
                        feature=features)).SerializeToString())
    print('Done! Created %d training examples and %d validation examples' %
          (numtrain, numval))
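Example #1 calls several helpers that are not shown (_int64_feature, _padded_int64_feature, _padded_float_feature, _labels_and_mask). Below is a minimal sketch of what they might look like, inferred only from the call sites; the real _labels_and_mask in particular may differ.

import tensorflow as tf

def _int64_feature(values):
    # wrap a list of ints in an Int64List feature
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

def _padded_int64_feature(values, length):
    # right-pad with zeros so every record has a fixed length
    return _int64_feature(values + [0] * (length - len(values)))

def _padded_float_feature(values, length):
    return _float_feature(values + [0.] * (length - len(values)))

def _labels_and_mask(labels, length):
    # assumption: positions labeled 0 carry no LM loss, so they are masked out
    mask = [1. if label != 0 else 0. for label in labels[:length]]
    return labels[:length], mask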
Example #2
def read_minst_from_tfrecords(filequeue, shape, one_hot=0, GZ=True):
    # Note: if num_epochs is not None, this function creates a local counter `epochs`.
    # Use tf.local_variables_initializer() to initialize local variables.
    # See the documentation of tf.train.string_input_producer().
    options = None
    if GZ:
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    reader = tf.TFRecordReader(options=options)
    _, serialized_example = reader.read(filequeue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            # 'height': tf.FixedLenFeature([], tf.int64),
            # 'width': tf.FixedLenFeature([], tf.int64),
            # 'depth': tf.FixedLenFeature([], tf.int64)
        })

    image = tf.decode_raw(features['image_raw'], tf.float32)
    image.set_shape([shape])
    label = tf.cast(features['label'], tf.int32)
    if one_hot > 0:
        label = tf.one_hot(label, one_hot)
    return image, label
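This reader assumes a module-level alias tpio for tf.python_io and the TF 1.x queue-based input pipeline. A minimal usage sketch under those assumptions (file name, image size, and class count are placeholders):

import tensorflow as tf
from tensorflow import python_io as tpio  # alias assumed by the snippet above

filequeue = tf.train.string_input_producer(['train.tfrecords.gz'], num_epochs=1)
image, label = read_minst_from_tfrecords(filequeue, shape=784, one_hot=10)

with tf.Session() as sess:
    # num_epochs creates a local variable, hence the local initializer
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        img, lbl = sess.run([image, label])
    finally:
        coord.request_stop()
        coord.join(threads)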
Example #3
def convert_to_tfrecords(file, data, height, width, depth, GZ=True):
    '''
    :param data: a pair of numpy arrays; data[0] holds the image data, data[1] the labels.
    '''
    images = data[0]
    labels = data[1]
    num_ex = np.size(data[1])

    filename = file + '.tfrecords'
    options = None
    if GZ:
        filename += '.gz'
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    print('Writing ', filename)
    writer = tf.python_io.TFRecordWriter(filename, options=options)

    for i in range(num_ex):
        print(i)
        image_raw = images[i].tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'height': _int64_features(height),
                'width': _int64_features(width),
                'depth': _int64_features(depth),
                'label': _int64_features(int(labels[i])),
                'image_raw': _bytes_features(image_raw)
            }))
        writer.write(example.SerializeToString())
    writer.close()
    print('finished')
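Examples #3 and #4 pass scalar values to _int64_features and _bytes_features, which are not shown. A plausible sketch, assuming the usual single-value wrappers:

import tensorflow as tf

def _int64_features(value):
    # wrap a single integer in an Int64List feature
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

def _bytes_features(value):
    # wrap raw bytes (e.g. the result of ndarray.tostring()) in a BytesList feature
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))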
Example #4
def convert_to_tfrecords(data, out_file, GZ=True):
    image = data[0]
    labels = data[1]
    count = np.size(labels)

    filename = out_file + '.tfrecords'
    options = None
    if GZ:
        filename += '.gz'
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    if os.path.exists(filename):
        print("File %s exists" % filename)
        return
    print("writing to %s..." % filename)
    writer = tpio.TFRecordWriter(filename, options=options)

    for i in range(count):
        sys.stdout.write("%d\r" % (i + 1))
        image_raw = image[i].tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'label': _int64_features(labels[i]),
                'image_raw': _bytes_features(image_raw)
            }))
        writer.write(example.SerializeToString())
    writer.close()
    print("\nfinished")
Example #5
def __enter__(self):
    if os.path.exists(self.output_file):
        raise IOError("file %s exists" % self.output_file)
    else:
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
        self.writer = tpio.TFRecordWriter(self.output_file,
                                          options=options)
    return self
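Since __enter__ returns the object itself, the surrounding class is presumably used as a context manager around a TFRecordWriter. A minimal sketch of the matching __exit__ (class name and usage are hypothetical):

class GzipRecordWriter(object):
    def __init__(self, output_file):
        self.output_file = output_file
        self.writer = None

    # __enter__ as shown in Example #5

    def __exit__(self, exc_type, exc_value, traceback):
        # close the underlying TFRecordWriter on exit, propagate any exception
        if self.writer is not None:
            self.writer.close()
        return False

# with GzipRecordWriter('out.tfrecords') as w:
#     w.writer.write(example.SerializeToString())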
Example #6
def preprocess(config):
  # concept
  # words  [batch_size, doc_len]
  # doclens [batch_size]

  # attr
  # sentence_idxs [batch_size, num_sentences, words_per_sentence]
  # sentence_lens [batch_size, num_sentences]
  # activity_idxs [batch_size, num_sentences, concepts_per_sentence]
  # activity_lens [batch_size, num_sentences]
  # concept_idxs [batch_size, num_sentences, concepts_per_sentence]
  # concept_lens [batch_size, num_sentences]

  # rel
  # concept_mention_idxs [batch_size, num_concept_mentions]
  # mentions_per_concept  [batch_size, global_concepts, global_concepts, mention_pairs, 2]
  #   last dim is [source mention, dest mention] index list
  # relation_mask [batch_size, global_concepts, global_concepts]

  # labels
  # boundary [batch_size, doc_len]
  # <attr_name> [batch_size, num_sentences, concepts_per_sentence]
  # relation [batch_size, global_concepts, global_concepts]

  import json
  import os
  from collections import defaultdict
  from random import random

  import numpy as np
  import tensorflow as tf
  from tensorflow import python_io as pio
  from tqdm import tqdm

  word2id = config.word2id
  activity_attributes = config.attr_info.activity_attributes
  common_attributes = config.attr_info.common_attributes

  train_file = os.path.join(config.record_dir, config.train_filename)
  val_file = os.path.join(config.record_dir, config.val_filename)

  num_train, num_val = 0, 0
  with pio.TFRecordWriter(train_file, options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as twriter, \
    pio.TFRecordWriter(val_file, options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as vwriter:
    jdata = json.load(open(config.json_data))
    for json_doc in tqdm(jdata, total=len(jdata)):
      words = []
      slens = []
      word_idxs = np.zeros([config.sents_per_doc, config.toks_per_sent], int)
      activity_lens = []
      activity_idxs = np.zeros([config.sents_per_doc, config.concepts_per_sent], int)
      concept_lens = []
      concept_idxs = np.zeros([config.sents_per_doc, config.concepts_per_sent], int)
      boundary_labels = []
      attribute_labels = defaultdict(lambda: np.zeros([config.sents_per_doc, config.concepts_per_sent], int))
      sent_offset = 0

      for i, sentence in enumerate(json_doc['sentences'][:config.sents_per_doc]):
        swords = [word2id[word] for word in sentence['words'][:config.toks_per_sent]]
        slens.append(len(swords))
        words += swords
        boundary_labels += [config.boundary2id[b] for b in sentence['boundary_labels'][:config.toks_per_sent]]

        word_idxs[i, :slens[i]] = range(sent_offset, sent_offset + slens[i])
        sent_offset += slens[i]

        concepts = sentence['concepts'][:config.concepts_per_sent]
        alen = 0
        concept_lens.append(len(concepts))
        for j, (cidx, label_dict) in enumerate(concepts):
          if 'morphology' in label_dict:
            # record the activity index and its attribute labels in the same slot
            activity_idxs[i, alen] = cidx
            for aname in activity_attributes:
              attribute_labels[aname][i, alen] = label_dict[aname]
            alen += 1
          concept_idxs[i, j] = cidx
          for aname in common_attributes:
            attribute_labels[aname][i, j] = label_dict[aname]
        activity_lens.append(alen)
      words = words[:config.max_doc_len]
      doclen = len(words)

      # concept_mentions[i] = doc-level id of word corresponding to ith mention
      concept_mentions = np.zeros(config.concept_mentions_per_doc, int)
      # conc2mentions[cid] = [mid] list of mention ids (indexes into concept_mentions)
      conc2mentions = defaultdict(list)
      for i, (concept_id, mention_idxs) in enumerate(list(json_doc['relations']['mentions'].items())[:config.concept_mentions_per_doc]):
        if int(concept_id) < config.concepts_per_doc:
          for [sent_id, word_id] in mention_idxs:
            if sent_id < config.sents_per_doc and word_id < config.toks_per_sent:
              concept_mentions[i] = word_idxs[sent_id, word_id]
              conc2mentions[int(concept_id)].append(i)
      relation_mask = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
      mentions_per_concept = np.zeros([config.concepts_per_doc, config.concepts_per_doc, config.mention_pairs, 2], int)
      mention_pairs_per_conc_pair = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
      print('Found %d concepts with %d mentions' % (len(conc2mentions), sum([len(m) for m in conc2mentions.values()])))

      # populate relation mask, mention_pairs_per_conc_pair, and mentions_per_concept
      for head_cid, head_mentions in dict(conc2mentions).items():
        for tail_cid, tail_mentions in dict(conc2mentions).items():
          if head_cid != tail_cid:
            relation_mask[head_cid, tail_cid] = 1
            mention_pairs_per_conc_pair[head_cid, tail_cid] = \
              min(config.mention_pairs, len(conc2mentions[head_cid]) * len(conc2mentions[tail_cid]))
            for h, head_mention in enumerate(head_mentions):
              for t, tail_mention in enumerate(tail_mentions):
                if h + t >= config.mention_pairs:
                  break
                mentions_per_concept[head_cid, tail_cid, h + t] = [head_mention, tail_mention]
      print('Mask members: %d' % int(np.sum(relation_mask)))

      relation_labels = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
      # populate relation_labels
      for label, head_cid, tail_cid in json_doc['relations']['labels']:
        relation_labels[head_cid, tail_cid] = label + 1
        # relation_mask[head_cid, tail_cid] = 1
        # mention_pairs_per_conc_pair[head_cid, tail_cid] = \
        #   min(config.mention_pairs, len(conc2mentions[head_cid]) * len(conc2mentions[tail_cid]))
        # for h, head_mention in enumerate(conc2mentions[head_cid]):
        #   for t, tail_mention in enumerate(conc2mentions[tail_cid]):
        #     if h + t >= config.mention_pairs:
        #       break
        #     mentions_per_concept[head_cid, tail_cid, h + t] = [head_mention, tail_mention]

      if len(conc2mentions) > 0:
        features = {
          'record_id': _bytes_feature(json_doc['id']),
          'words': _int64_feature(words + [0] * (config.max_doc_len - len(words))),
          'doclen': _int64_feature([doclen]),
          'sentence_idxs': _int64_feature(word_idxs.flatten().tolist()),
          'sentence_lens': _int64_feature(slens + [0] * (config.sents_per_doc - len(slens))),
          'activity_idxs': _int64_feature(activity_idxs.flatten().tolist()),
          'activity_lens': _int64_feature(activity_lens + [0] * (config.sents_per_doc - len(activity_lens))),
          'concept_idxs': _int64_feature(concept_idxs.flatten().tolist()),
          'concept_lens': _int64_feature(concept_lens + [0] * (config.sents_per_doc - len(concept_lens))),
          'concept_mention_idxs': _int64_feature(concept_mentions.tolist()),
          'mentions_per_concept': _int64_feature(mentions_per_concept.flatten().tolist()),
          'relation_mask': _int64_feature(relation_mask.flatten().tolist()),
          'mention_pairs_per_conc_pair': _int64_feature(mention_pairs_per_conc_pair.flatten().tolist()),
          'boundary_labels': _int64_feature(boundary_labels[:config.max_doc_len] +
                                            [config.boundary2id['O']] * (config.max_doc_len - len(boundary_labels))),
          'relation_labels': _int64_feature(relation_labels.flatten().tolist())
        }
        for attr_label, labels_matrix in attribute_labels.items():
          features[attr_label] = _int64_feature(labels_matrix.flatten().tolist())

        example = tf.train.Example(features=tf.train.Features(feature=features)).SerializeToString()
        if random() < config.validation_proportion and num_val < 36:
          vwriter.write(example)
          num_val += 1
        else:
          twriter.write(example)
          num_train += 1
  print("Saved %d training examples" % num_train)
  print("Saved %d validation examples" % num_val)