# Assumes module-level imports and helpers not shown here, e.g.
#   from os import listdir, path
#   from sys import stdout
#   from tensorflow import python_io
# plus IMG_ROOT_DIR, load_csv() and process_folder() defined elsewhere.
def process_img_directories():
    test_writer = python_io.TFRecordWriter('./test.tfrecord')
    train_writer = python_io.TFRecordWriter('./train.tfrecord')

    folder_label, label_map = 0, {}
    ex_tst, ex_trn = 0, 0
    for folder in listdir(IMG_ROOT_DIR):
        folder_path = path.join(IMG_ROOT_DIR, folder)
        csv_data = load_csv(folder, folder_path)

        n_test, n_train = process_folder(folder_path, csv_data, folder_label,
                                         test_writer, train_writer)
        ex_tst, ex_trn = ex_tst + n_test, ex_trn + n_train

        label_map[folder] = folder_label
        folder_label += 1

    test_writer.close()
    train_writer.close()
    stdout.flush()

    print('-' * 80)
    print(label_map)
    print('-' * 80)
    print('Number of training examples: ', ex_trn)
    print('Number of testing examples: ', ex_tst)
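
A quick sanity check for the files written above; a minimal sketch, assuming TensorFlow 1.x where tf.python_io.tf_record_iterator is available:

import tensorflow as tf

def count_records(path):
    # Count the serialized examples in a TFRecord file.
    return sum(1 for _ in tf.python_io.tf_record_iterator(path))

print('train records:', count_records('./train.tfrecord'))
print('test records:', count_records('./test.tfrecord'))
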
Example #2
def preprocess(config):
    from tensorflow import python_io as pio
    import os
    from random import random

    data_dir = config.pretrain_data_dir
    word2id = json.load(open(os.path.join(data_dir, 'word2id.json')))
    slen = config.toks_per_sent

    numval, numtrain = 0, 0

    with pio.TFRecordWriter(os.path.join(data_dir, 'train.tfrecord'),
                            options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as twriter, \
      pio.TFRecordWriter(os.path.join(data_dir, 'val.tfrecord'),
                         options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as vwriter:
        jdata = json.load(open(os.path.join(data_dir, 'data.json')))
        for example in tqdm(jdata, total=len(jdata)):
            tokens = [word2id['[CLS]']] + example['s1']['tokens'] + [word2id['[SEP]']] \
                + example['s2']['tokens'] + [word2id['[SEP]']]

            s1_len = len(example['s1']['tokens'])
            s2_len = len(example['s2']['tokens'])
            s1_labels, s1_mask = _labels_and_mask(example['s1']['labels'],
                                                  s1_len)
            s2_labels, s2_mask = _labels_and_mask(example['s2']['labels'],
                                                  s2_len)
            a_mask = [0.] + [1.] * s1_len + [0.] * (2 + s2_len)
            b_mask = [0.] * (2 + s1_len) + [1.] * s2_len + [0.]
            lm_labels = [0] + s1_labels + [0] + s2_labels + [0]
            lm_mask = [0.] + s1_mask + [0.] + s2_mask + [0.]

            features = {
                'label': _int64_feature([example['label']]),
                'tokens': _padded_int64_feature(tokens, slen),
                'a_mask': _padded_float_feature(a_mask, slen),
                'b_mask': _padded_float_feature(b_mask, slen),
                'lm_labels': _padded_int64_feature(lm_labels, slen),
                'lm_mask': _padded_float_feature(lm_mask, slen)
            }

            if random() > 0.05:
                numtrain += 1
                twriter.write(
                    tf.train.Example(features=tf.train.Features(
                        feature=features)).SerializeToString())
            else:
                numval += 1
                vwriter.write(
                    tf.train.Example(features=tf.train.Features(
                        feature=features)).SerializeToString())
    print('Done! Created %d training examples and %d validation examples' %
          (numtrain, numval))
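
The feature helpers used above are not shown in this example; a minimal sketch of what they might look like (the names appear in the snippet, but the padding behaviour here is an assumption):

import tensorflow as tf

def _int64_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _padded_int64_feature(values, length):
    # pad (or truncate) to a fixed length so every record has the same shape
    return _int64_feature((values + [0] * length)[:length])

def _padded_float_feature(values, length):
    padded = (values + [0.] * length)[:length]
    return tf.train.Feature(float_list=tf.train.FloatList(value=padded))
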
Example #3
# tpio is assumed to alias tensorflow.python_io; np, os, sys, tf and the
# feature helpers (_int64_features, _bytes_features) are defined elsewhere.
def convert_to_tfrecords(data, out_file, GZ=True):
    image = data[0]
    labels = data[1]
    count = np.size(labels)

    filename = out_file + '.tfrecords'
    options = None
    if GZ:
        filename += '.gz'
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    if os.path.exists(filename):
        print("File %s exists" % filename)
        return
    print("writing to %s..." % filename)
    writer = tpio.TFRecordWriter(filename, options=options)

    for i in range(count):
        sys.stdout.write("%d\r" % (i + 1))
        image_raw = image[i].tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'label': _int64_features(labels[i]),
                'image_raw': _bytes_features(image_raw)
            }))
        writer.write(example.SerializeToString())
    writer.close()
    print("\nfinished")
Example #4
 def __enter__(self):
     if os.path.exists(self.output_file):
         raise IOError("file %s exists" % self.output_file)
     else:
         options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
         self.writer = tpio.TFRecordWriter(self.output_file,
                                           options=options)
     return self
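
A sketch of the matching __exit__ method (not shown in this example); presumably it just closes the writer so the object works as a context manager:

 def __exit__(self, exc_type, exc_value, traceback):
     self.writer.close()
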
Example #5
 def __init__(self, dataset, out_dir):
     self.dataset = dataset
     self.out_dir = out_dir
     self.tfr_writer = python_io.TFRecordWriter(
         os.path.join(out_dir, 'dataset.tfrecords'))
     self.image_dir = 'IMG'
     self.image_dir_path = os.path.join(out_dir, self.image_dir)
     ensure_dir(self.image_dir_path)
Example #6
def run(output_dir, name='gdxray_train', shuffling=False):
    """Runs the conversion operation.

    Args:
      dataset_dir: The dataset directory where the dataset is stored.
      output_dir: Output directory.
    """

    # Process dataset files.
    tf_train_filename = _get_output_filename(output_dir, 'train', name)
    tf_eval_filename = _get_output_filename(output_dir, 'eval', name)
    train_split, eval_split = _get_train_eval_split()

    with tf.Session() as sess:

        i = 0
        b = 0
        print("Writing train file to", tf_train_filename)
        with python_io.TFRecordWriter(tf_train_filename) as train_writer:
            for image in get_images():
                if len(image.boxes) and image.filename in train_split:
                    print('\r>> Adding image [%i]: %s' % (i, image.filename))
                    _add_to_tfrecord(sess, image, train_writer)
                    i += 1
                    b += len(image.boxes)

        print("Added {0} images with {1} bounding boxes".format(i, b))

        i = 0
        b = 0
        print("Writing eval file to", tf_eval_filename)
        with python_io.TFRecordWriter(tf_eval_filename) as eval_writer:
            for image in get_images():
                if image.filename in eval_split:
                    print('\r>> Adding image [%i]: %s' % (i, image.filename))
                    _add_to_tfrecord(sess, image, eval_writer)
                    i += 1
                    b += len(image.boxes)

        print("Added {0} images with {1} bounding boxes".format(i, b))

    print('\nFinished converting the GDXray Dataset!')
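
The _get_output_filename helper is not shown; a hypothetical sketch of what it might do (the naming scheme is an assumption):

import os

def _get_output_filename(output_dir, split, name):
    return os.path.join(output_dir, '%s_%s.tfrecord' % (name, split))
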
Example #7
def parse(input_file, output_file):
    tf.logging.debug('Input: {}'.format(input_file))
    with python_io.TFRecordWriter(output_file) as writer:
        with tf.gfile.GFile(input_file) as lines:
            for line in lines:
                line = line.strip().split(',')
                features = [float(val) for val in line[:-1]]
                label = int(line[-1])

                #tf.logging.debug('feat: {}'.format(len(features)))
                #tf.logging.debug('label: {}'.format(label))

                example = tf.train.Example()
                example.features.feature['x'].float_list.value.extend(features)
                example.features.feature['y'].int64_list.value.extend([label])
                writer.write(example.SerializeToString())

    tf.logging.debug('Output: {}'.format(output_file))
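
Parsing the records written above is not shown; a minimal sketch, where 'x' is treated as variable-length because CSV rows may differ in width:

import tensorflow as tf

def _parse_fn(serialized):
    parsed = tf.parse_single_example(serialized, features={
        'x': tf.VarLenFeature(tf.float32),
        'y': tf.FixedLenFeature([1], tf.int64),
    })
    return tf.sparse_tensor_to_dense(parsed['x']), parsed['y']

dataset = tf.data.TFRecordDataset('output.tfrecord').map(_parse_fn)  # path is illustrative
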
Example #8
def preprocess(config):
  # concept
  # words  [batch_size, doc_len]
  # doclens [batch_size]

  # attr
  # sentence_idxs [batch_size, num_sentences, words_per_sentence]
  # sentence_lens [batch_size, num_sentences]
  # activity_idxs [batch_size, num_sentences, concepts_per_sentence]
  # activity_lens [batch_size, num_sentences]
  # concept_idxs [batch_size, num_sentences, concepts_per_sentence]
  # concept_lens [batch_size, num_sentences]

  # rel
  # concept_mention_idxs [batch_size, num_concept_mentions]
  # mentions_per_concept  [batch_size, global_concepts, global_concepts, mention_pairs, 2]
  #   last dim is [source mention, dest mention] index list
  # relation_mask [batch_size, global_concepts, global_concepts]

  # labels
  # boundary [batch_size, doc_len]
  # <attr_name> [batch_size, num_sentences, concepts_per_sentence]
  # relation [batch_size, global_concepts, global_concepts]

  from tensorflow import python_io as pio
  import os
  from random import random

  word2id = config.word2id
  activity_attributes = config.attr_info.activity_attributes
  common_attributes = config.attr_info.common_attributes

  train_file = os.path.join(config.record_dir, config.train_filename)
  val_file = os.path.join(config.record_dir, config.val_filename)

  num_train, num_val = 0, 0
  with pio.TFRecordWriter(train_file, options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as twriter, \
    pio.TFRecordWriter(val_file, options=pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)) as vwriter:
    jdata = json.load(open(config.json_data))
    for json_doc in tqdm(jdata, total=len(jdata)):
      words = []
      slens = []
      word_idxs = np.zeros([config.sents_per_doc, config.toks_per_sent], int)
      activity_lens = []
      activity_idxs = np.zeros([config.sents_per_doc, config.concepts_per_sent], int)
      concept_lens = []
      concept_idxs = np.zeros([config.sents_per_doc, config.concepts_per_sent], int)
      boundary_labels = []
      attribute_labels = defaultdict(lambda: np.zeros([config.sents_per_doc, config.concepts_per_sent], int))
      sent_offset = 0

      for i, sentence in enumerate(json_doc['sentences'][:config.sents_per_doc]):
        swords = [word2id[word] for word in sentence['words'][:config.toks_per_sent]]
        slens.append(len(swords))
        words += swords
        boundary_labels += [config.boundary2id[b] for b in sentence['boundary_labels'][:config.toks_per_sent]]

        word_idxs[i, :slens[i]] = range(sent_offset, sent_offset + slens[i])
        sent_offset += slens[i]

        concepts = sentence['concepts'][:config.concepts_per_sent]
        alen = 0
        concept_lens.append(len(concepts))
        for j, (cidx, label_dict) in enumerate(concepts):
          if 'morphology' in label_dict:
            activity_idxs[i, alen] = cidx
            for aname in activity_attributes:
              # keep attribute labels aligned with the slot just used in activity_idxs
              attribute_labels[aname][i, alen] = label_dict[aname]
            alen += 1
          concept_idxs[i, j] = cidx
          for aname in common_attributes:
            attribute_labels[aname][i, j] = label_dict[aname]
        activity_lens.append(alen)
      words = words[:config.max_doc_len]
      doclen = len(words)

      # concept_mentions[i] = doc-level id of word corresponding to ith mention
      concept_mentions = np.zeros(config.concept_mentions_per_doc, int)
      # conc2mentions[cid] = [mid] list of mention ids (indexes into concept_mentions)
      conc2mentions = defaultdict(list)
      for i, (concept_id, mention_idxs) in enumerate(list(json_doc['relations']['mentions'].items())[:config.concept_mentions_per_doc]):
        if int(concept_id) < config.concepts_per_doc:
          for [sent_id, word_id] in mention_idxs:
            if sent_id < config.sents_per_doc and word_id < config.toks_per_sent:
              concept_mentions[i] = word_idxs[sent_id, word_id]
              conc2mentions[int(concept_id)].append(i)
      relation_mask = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
      mentions_per_concept = np.zeros([config.concepts_per_doc, config.concepts_per_doc, config.mention_pairs, 2], int)
      mention_pairs_per_conc_pair = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
      print('Found %d concepts with %d mentions' % (len(conc2mentions), sum([len(m) for m in conc2mentions.values()])))

      # populate relation mask, mention_pairs_per_conc_pair, and mentions_per_concept
      for head_cid, head_mentions in dict(conc2mentions).items():
        for tail_cid, tail_mentions in dict(conc2mentions).items():
          if head_cid != tail_cid:
            relation_mask[head_cid, tail_cid] = 1
            mention_pairs_per_conc_pair[head_cid, tail_cid] = \
              min(config.mention_pairs, len(conc2mentions[head_cid]) * len(conc2mentions[tail_cid]))
            for h, head_mention in enumerate(head_mentions):
              for t, tail_mention in enumerate(tail_mentions):
                if h + t >= config.mention_pairs:
                  break
                mentions_per_concept[head_cid, tail_cid, h + t] = [head_mention, tail_mention]
      print('Mask members: %d' % int(np.sum(relation_mask)))

      relation_labels = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
      # populate relation_labels
      for label, head_cid, tail_cid in json_doc['relations']['labels']:
        relation_labels[head_cid, tail_cid] = label + 1
        # relation_mask[head_cid, tail_cid] = 1
        # mention_pairs_per_conc_pair[head_cid, tail_cid] = \
        #   min(config.mention_pairs, len(conc2mentions[head_cid]) * len(conc2mentions[tail_cid]))
        # for h, head_mention in enumerate(conc2mentions[head_cid]):
        #   for t, tail_mention in enumerate(conc2mentions[tail_cid]):
        #     if h + t >= config.mention_pairs:
        #       break
        #     mentions_per_concept[head_cid, tail_cid, h + t] = [head_mention, tail_mention]

      if len(conc2mentions) > 0:
        features = {
          'record_id': _bytes_feature(json_doc['id']),
          'words': _int64_feature(words + [0] * (config.max_doc_len - len(words))),
          'doclen': _int64_feature([doclen]),
          'sentence_idxs': _int64_feature(word_idxs.flatten().tolist()),
          'sentence_lens': _int64_feature(slens + [0] * (config.sents_per_doc - len(slens))),
          'activity_idxs': _int64_feature(activity_idxs.flatten().tolist()),
          'activity_lens': _int64_feature(activity_lens + [0] * (config.sents_per_doc - len(activity_lens))),
          'concept_idxs': _int64_feature(concept_idxs.flatten().tolist()),
          'concept_lens': _int64_feature(concept_lens + [0] * (config.sents_per_doc - len(concept_lens))),
          'concept_mention_idxs': _int64_feature(concept_mentions.tolist()),
          'mentions_per_concept': _int64_feature(mentions_per_concept.flatten().tolist()),
          'relation_mask': _int64_feature(relation_mask.flatten().tolist()),
          'mention_pairs_per_conc_pair': _int64_feature(mention_pairs_per_conc_pair.flatten().tolist()),
          'boundary_labels': _int64_feature(boundary_labels[:config.max_doc_len] +
                                            [config.boundary2id['O']] * (config.max_doc_len - len(boundary_labels))),
          'relation_labels': _int64_feature(relation_labels.flatten().tolist())
        }
        for attr_label, labels_matrix in attribute_labels.items():
          features[attr_label] = _int64_feature(labels_matrix.flatten().tolist())

        example = tf.train.Example(features=tf.train.Features(feature=features)).SerializeToString()
        if random() < config.validation_proportion and num_val < 36:
          vwriter.write(example)
          num_val += 1
        else:
          twriter.write(example)
          num_train += 1
  print("Saved %d training examples" % num_train)
  print("Saved %d validation examples" % num_val)
Example #9
              batch_size=pack_size,
              ignore_goods=False)
data = next(gen)
dataset_num = data[3]

count = 0

while count * 30 < dataset_num / 25:  #dataset_num :
    try:
        data = next(gen)  # fetch 256*30 images
        starts = count
        ends = starts + pack_size - 1
        count = count + pack_size
        tfrecords_filename = '../video_prediction/data/comma/test/traj_%d_to_%d.tfrecords' % (
            starts, ends)
        writer = python_io.TFRecordWriter(
            tfrecords_filename)  # create the .tfrecord file for writing

        for i in range(pack_size):
            #data = next(gen)
            img = np.transpose(data[0][i], (0, 2, 3, 1))
            angle = data[1][i]
            speed = data[2][i]
            img_raw = []
            for k in range(img.shape[0]):
                #img_resize = transform.rescale(img[k],0.25,mode='constant')   # resize to 40*80
                #img_raw.append(img_resize.tostring())
                img_raw.append(img[k].tostring())

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    '0/angle':