def preprocess(config):
    """Converts the pretraining JSON data into GZIP-compressed TFRecord files,
    splitting examples roughly 95%/5% into train and validation sets."""
    import json
    import os
    from random import random

    import tensorflow as tf
    from tensorflow import python_io as pio
    from tqdm import tqdm

    data_dir = config.pretrain_data_dir
    word2id = json.load(open(os.path.join(data_dir, 'word2id.json')))
    slen = config.toks_per_sent
    numval, numtrain = 0, 0
    gzip_opts = pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)
    with pio.TFRecordWriter(os.path.join(data_dir, 'train.tfrecord'), options=gzip_opts) as twriter, \
            pio.TFRecordWriter(os.path.join(data_dir, 'val.tfrecord'), options=gzip_opts) as vwriter:
        jdata = json.load(open(os.path.join(data_dir, 'data.json')))
        for example in tqdm(jdata, total=len(jdata)):
            # [CLS] s1 [SEP] s2 [SEP]; sentence tokens are already word ids
            tokens = [word2id['[CLS]']] + example['s1']['tokens'] + [word2id['[SEP]']] \
                + example['s2']['tokens'] + [word2id['[SEP]']]
            s1_len = len(example['s1']['tokens'])
            s2_len = len(example['s2']['tokens'])
            s1_labels, s1_mask = _labels_and_mask(example['s1']['labels'], s1_len)
            s2_labels, s2_mask = _labels_and_mask(example['s2']['labels'], s2_len)
            # segment masks: a_mask covers sentence-1 tokens, b_mask covers sentence-2 tokens
            a_mask = [0.] + [1.] * s1_len + [0.] * (2 + s2_len)
            b_mask = [0.] * (2 + s1_len) + [1.] * s2_len + [0.]
            # LM labels and mask are zero at the [CLS]/[SEP] positions
            lm_labels = [0] + s1_labels + [0] + s2_labels + [0]
            lm_mask = [0.] + s1_mask + [0.] + s2_mask + [0.]
            # sequences are assumed to fit within config.toks_per_sent before padding
            features = {
                'label': _int64_feature([example['label']]),
                'tokens': _padded_int64_feature(tokens, slen),
                'a_mask': _padded_float_feature(a_mask, slen),
                'b_mask': _padded_float_feature(b_mask, slen),
                'lm_labels': _padded_int64_feature(lm_labels, slen),
                'lm_mask': _padded_float_feature(lm_mask, slen)
            }
            serialized = tf.train.Example(
                features=tf.train.Features(feature=features)).SerializeToString()
            if random() > 0.05:
                numtrain += 1
                twriter.write(serialized)
            else:
                numval += 1
                vwriter.write(serialized)
    print('Done! Created %d training examples and %d validation examples' % (numtrain, numval))
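# The feature helpers used above (_int64_feature, _padded_int64_feature,
# _padded_float_feature) are not defined in this file. A minimal sketch of what
# they are assumed to look like, right-padding with zeros to a fixed length and
# assuming `import tensorflow as tf` at module scope:
def _int64_feature(values):
    # values: list of ints
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _padded_int64_feature(values, length):
    return _int64_feature(values + [0] * (length - len(values)))

def _padded_float_feature(values, length):
    return tf.train.Feature(
        float_list=tf.train.FloatList(value=values + [0.] * (length - len(values))))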
def read_minst_from_tfrecords(filequeue, shape, one_hot=0, GZ=True):
    # Note: if num_epochs is not None, string_input_producer() creates a local
    # epoch counter; run local_variables_initializer() before starting the queue
    # runners (see the string_input_producer() documentation).
    from tensorflow import python_io as tpio
    options = None
    if GZ:
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    reader = tf.TFRecordReader(options=options)
    _, serialized_example = reader.read(filequeue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            # 'height': tf.FixedLenFeature([], tf.int64),
            # 'width': tf.FixedLenFeature([], tf.int64),
            # 'depth': tf.FixedLenFeature([], tf.int64)
        })
    # image bytes were written with ndarray.tostring() on float32 data
    image = tf.decode_raw(features['image_raw'], tf.float32)
    image.set_shape([shape])
    label = tf.cast(features['label'], tf.int32)
    if one_hot > 0:
        label = tf.one_hot(label, one_hot)
    return image, label
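# A usage sketch for the queue-based reader above (the filename, batch size, and
# image shape are assumptions, not part of the original code):
def _mnist_input_pipeline():
    filequeue = tf.train.string_input_producer(['mnist_train.tfrecords.gz'], num_epochs=1)
    image, label = read_minst_from_tfrecords(filequeue, shape=28 * 28, one_hot=10)
    # shuffle_batch requires the queue runners to be started and local variables initialized
    images, labels = tf.train.shuffle_batch(
        [image, label], batch_size=64, capacity=2000, min_after_dequeue=1000)
    return images, labels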
def convert_to_tfrecords(file, data, height, width, depth, GZ=True):
    '''
    :param data: a two-element sequence; data[0] contains the image data,
        data[1] the labels.
    '''
    from tensorflow import python_io as tpio
    images = data[0]
    labels = data[1]
    num_ex = np.size(data[1])
    filename = file + '.tfrecords'
    options = None
    if GZ:
        filename += '.gz'
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    print('Writing ', filename)
    writer = tpio.TFRecordWriter(filename, options=options)
    for i in range(num_ex):
        print(i)
        image_raw = images[i].tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'height': _int64_features(height),
                'width': _int64_features(width),
                'depth': _int64_features(depth),
                'label': _int64_features(int(labels[i])),
                'image_raw': _bytes_features(image_raw)
            }))
        writer.write(example.SerializeToString())
    writer.close()
    print('finished')
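# The `_int64_features` / `_bytes_features` helpers used by the TFRecord
# converters in this file are not shown; a minimal sketch of what they are
# assumed to be:
def _int64_features(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

def _bytes_features(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))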
def convert_to_tfrecords(data, out_file, GZ=True):
    from tensorflow import python_io as tpio
    image = data[0]
    labels = data[1]
    count = np.size(labels)
    filename = out_file + '.tfrecords'
    options = None
    if GZ:
        filename += '.gz'
        options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    if os.path.exists(filename):
        print("File %s exists" % filename)
        return
    print("writing to %s..." % filename)
    writer = tpio.TFRecordWriter(filename, options=options)
    for i in range(count):
        sys.stdout.write("%d\r" % (i + 1))
        image_raw = image[i].tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'label': _int64_features(labels[i]),
                'image_raw': _bytes_features(image_raw)
            }))
        writer.write(example.SerializeToString())
    writer.close()
    print("\nfinished")
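# Hedged usage sketch for the second converter above (the array shapes and the
# output name are assumptions): write ten random float32 "images" and labels.
def _write_toy_records():
    images = np.random.rand(10, 28 * 28).astype(np.float32)
    labels = np.arange(10, dtype=np.int64)
    convert_to_tfrecords((images, labels), 'toy', GZ=True)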
def __enter__(self):
    # method of a TFRecord-writer context manager; refuses to overwrite an existing file
    if os.path.exists(self.output_file):
        raise IOError("file %s exists" % self.output_file)
    options = tpio.TFRecordOptions(tpio.TFRecordCompressionType.GZIP)
    self.writer = tpio.TFRecordWriter(self.output_file, options=options)
    return self
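# The matching __exit__ is not shown in this snippet; a minimal sketch that
# closes the writer and lets exceptions propagate would be:
def __exit__(self, exc_type, exc_value, traceback):
    writer = getattr(self, 'writer', None)
    if writer is not None:
        writer.close()
    return False  # do not suppress exceptions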
def preprocess(config):
    # Feature shapes after batching:
    # concept
    #   words                 [batch_size, doc_len]
    #   doclens               [batch_size]
    # attr
    #   sentence_idxs         [batch_size, num_sentences, words_per_sentence]
    #   sentence_lens         [batch_size, num_sentences]
    #   activity_idxs         [batch_size, num_sentences, concepts_per_sentence]
    #   activity_lens         [batch_size, num_sentences]
    #   concept_idxs          [batch_size, num_sentences, concepts_per_sentence]
    #   concept_lens          [batch_size, num_sentences]
    # rel
    #   concept_mention_idxs  [batch_size, num_concept_mentions]
    #   mentions_per_concept  [batch_size, global_concepts, global_concepts, mention_pairs, 2]
    #                         last dim is a [source mention, dest mention] index pair
    #   relation_mask         [batch_size, global_concepts, global_concepts]
    # labels
    #   boundary              [batch_size, doc_len]
    #   <attr_name>           [batch_size, num_sentences, concepts_per_sentence]
    #   relation              [batch_size, global_concepts, global_concepts]
    import json
    import os
    from collections import defaultdict
    from random import random

    import numpy as np
    import tensorflow as tf
    from tensorflow import python_io as pio
    from tqdm import tqdm

    word2id = config.word2id
    activity_attributes = config.attr_info.activity_attributes
    common_attributes = config.attr_info.common_attributes
    train_file = os.path.join(config.record_dir, config.train_filename)
    val_file = os.path.join(config.record_dir, config.val_filename)
    num_train, num_val = 0, 0
    gzip_opts = pio.TFRecordOptions(pio.TFRecordCompressionType.GZIP)
    with pio.TFRecordWriter(train_file, options=gzip_opts) as twriter, \
            pio.TFRecordWriter(val_file, options=gzip_opts) as vwriter:
        jdata = json.load(open(config.json_data))
        for json_doc in tqdm(jdata, total=len(jdata)):
            words = []
            slens = []
            word_idxs = np.zeros([config.sents_per_doc, config.toks_per_sent], int)
            activity_lens = []
            activity_idxs = np.zeros([config.sents_per_doc, config.concepts_per_sent], int)
            concept_lens = []
            concept_idxs = np.zeros([config.sents_per_doc, config.concepts_per_sent], int)
            boundary_labels = []
            attribute_labels = defaultdict(
                lambda: np.zeros([config.sents_per_doc, config.concepts_per_sent], int))
            sent_offset = 0
            for i, sentence in enumerate(json_doc['sentences'][:config.sents_per_doc]):
                swords = [word2id[word] for word in sentence['words'][:config.toks_per_sent]]
                slens.append(len(swords))
                words += swords
                boundary_labels += [config.boundary2id[b]
                                    for b in sentence['boundary_labels'][:config.toks_per_sent]]
                # doc-level index of each token in this sentence
                word_idxs[i, :slens[i]] = range(sent_offset, sent_offset + slens[i])
                sent_offset += slens[i]
                concepts = sentence['concepts'][:config.concepts_per_sent]
                alen = 0
                concept_lens.append(len(concepts))
                for j, (cidx, label_dict) in enumerate(concepts):
                    if 'morphology' in label_dict:
                        # activity attribute labels are stored at the same slot as activity_idxs
                        activity_idxs[i, alen] = cidx
                        for aname in activity_attributes:
                            attribute_labels[aname][i, alen] = label_dict[aname]
                        alen += 1
                    concept_idxs[i, j] = cidx
                    for aname in common_attributes:
                        attribute_labels[aname][i, j] = label_dict[aname]
                activity_lens.append(alen)
            words = words[:config.max_doc_len]
            doclen = len(words)
            # concept_mentions[m] = doc-level index of the word anchoring the m-th mention
            concept_mentions = np.zeros(config.concept_mentions_per_doc, int)
            # conc2mentions[cid] = list of mention ids (indexes into concept_mentions)
            conc2mentions = defaultdict(list)
            mention_id = 0
            for concept_id, mention_idxs in json_doc['relations']['mentions'].items():
                if int(concept_id) >= config.concepts_per_doc:
                    continue
                for sent_id, word_id in mention_idxs:
                    if mention_id >= config.concept_mentions_per_doc:
                        break
                    if sent_id < config.sents_per_doc and word_id < config.toks_per_sent:
                        concept_mentions[mention_id] = word_idxs[sent_id, word_id]
                        conc2mentions[int(concept_id)].append(mention_id)
                        mention_id += 1
            relation_mask = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
            mentions_per_concept = np.zeros(
                [config.concepts_per_doc, config.concepts_per_doc, config.mention_pairs, 2], int)
            mention_pairs_per_conc_pair = np.zeros(
                [config.concepts_per_doc, config.concepts_per_doc], int)
            print('Found %d concepts with %d mentions'
                  % (len(conc2mentions), sum(len(m) for m in conc2mentions.values())))
            # populate relation_mask, mention_pairs_per_conc_pair, and mentions_per_concept
            for head_cid, head_mentions in conc2mentions.items():
                for tail_cid, tail_mentions in conc2mentions.items():
                    if head_cid != tail_cid:
                        relation_mask[head_cid, tail_cid] = 1
                        mention_pairs_per_conc_pair[head_cid, tail_cid] = \
                            min(config.mention_pairs, len(head_mentions) * len(tail_mentions))
                        for h, head_mention in enumerate(head_mentions):
                            for t, tail_mention in enumerate(tail_mentions):
                                if h + t >= config.mention_pairs:
                                    break
                                mentions_per_concept[head_cid, tail_cid, h + t] = \
                                    [head_mention, tail_mention]
            print('Mask members: %d' % int(np.sum(relation_mask)))
            relation_labels = np.zeros([config.concepts_per_doc, config.concepts_per_doc], int)
            # populate relation_labels; 0 is reserved for "no relation", hence label + 1
            # (relation_mask and mentions_per_concept are already populated above for all pairs)
            for label, head_cid, tail_cid in json_doc['relations']['labels']:
                if head_cid < config.concepts_per_doc and tail_cid < config.concepts_per_doc:
                    relation_labels[head_cid, tail_cid] = label + 1
            if len(conc2mentions) > 0:
                features = {
                    'record_id': _bytes_feature(json_doc['id']),
                    'words': _int64_feature(words + [0] * (config.max_doc_len - len(words))),
                    'doclen': _int64_feature([doclen]),
                    'sentence_idxs': _int64_feature(word_idxs.flatten().tolist()),
                    'sentence_lens': _int64_feature(
                        slens + [0] * (config.sents_per_doc - len(slens))),
                    'activity_idxs': _int64_feature(activity_idxs.flatten().tolist()),
                    'activity_lens': _int64_feature(
                        activity_lens + [0] * (config.sents_per_doc - len(activity_lens))),
                    'concept_idxs': _int64_feature(concept_idxs.flatten().tolist()),
                    'concept_lens': _int64_feature(
                        concept_lens + [0] * (config.sents_per_doc - len(concept_lens))),
                    'concept_mention_idxs': _int64_feature(concept_mentions.tolist()),
                    'mentions_per_concept': _int64_feature(mentions_per_concept.flatten().tolist()),
                    'relation_mask': _int64_feature(relation_mask.flatten().tolist()),
                    'mention_pairs_per_conc_pair': _int64_feature(
                        mention_pairs_per_conc_pair.flatten().tolist()),
                    'boundary_labels': _int64_feature(
                        boundary_labels[:config.max_doc_len]
                        + [config.boundary2id['O']] * (config.max_doc_len - len(boundary_labels))),
                    'relation_labels': _int64_feature(relation_labels.flatten().tolist())
                }
                for attr_label, labels_matrix in attribute_labels.items():
                    features[attr_label] = _int64_feature(labels_matrix.flatten().tolist())
                example = tf.train.Example(
                    features=tf.train.Features(feature=features)).SerializeToString()
                # cap the validation set at 36 documents
                if random() < config.validation_proportion and num_val < 36:
                    vwriter.write(example)
                    num_val += 1
                else:
                    twriter.write(example)
                    num_train += 1
    print("Saved %d training examples" % num_train)
    print("Saved %d validation examples" % num_val)
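# A hedged sketch of reading the GZIP records written above with tf.data; only a
# few of the features are parsed here, and the fixed lengths are taken from the
# same config object, so the function name is an assumption:
def load_relation_dataset(path, config):
    def _parse(serialized):
        spec = {
            'words': tf.FixedLenFeature([config.max_doc_len], tf.int64),
            'doclen': tf.FixedLenFeature([1], tf.int64),
            'relation_labels': tf.FixedLenFeature(
                [config.concepts_per_doc * config.concepts_per_doc], tf.int64),
        }
        return tf.parse_single_example(serialized, spec)
    return tf.data.TFRecordDataset(path, compression_type='GZIP').map(_parse)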