def create_examples(data, bert_client, training=True, label2int=None, class_weight=None):
    """
    data: pd.DataFrame
    label2int: dict
    class_weight: list

    yield examples
    """
    idx_start = data.index[0]
    A_encoded = bert_client.encode(data['title1_en'].tolist())
    B_encoded = bert_client.encode(data['title2_en'].tolist())
    for i in range(len(data)):
        feature = {
            'A_encoded': Feature(float_list=FloatList(value=A_encoded[i])),
            'B_encoded': Feature(float_list=FloatList(value=B_encoded[i]))
        }
        if training:
            label = label2int[data.loc[idx_start + i, 'label']]
            feature['label'] = Feature(int64_list=Int64List(value=[label]))
            feature['class_weight'] = Feature(float_list=FloatList(
                value=[class_weight[label]]))
        else:
            feature['id'] = Feature(int64_list=Int64List(
                value=[data.loc[idx_start + i, 'id']]))
        yield Example(features=Features(feature=feature))
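# A minimal usage sketch for create_examples, not part of the original code. It assumes
# `df` is a pandas DataFrame with the columns used above ('title1_en', 'title2_en',
# 'label') and `bc` is a bert-serving BertClient; the label mapping and uniform class
# weights below are placeholders, not values from the original pipeline.
import tensorflow as tf

def write_training_tfrecord(df, bc, path='train.tfrecord'):
    # build a label-to-int mapping from whatever labels the DataFrame actually contains
    label2int = {lab: i for i, lab in enumerate(sorted(df['label'].unique()))}
    class_weight = [1.0] * len(label2int)  # uniform weights as a placeholder
    with tf.io.TFRecordWriter(path) as writer:
        for example in create_examples(df, bc, training=True,
                                       label2int=label2int,
                                       class_weight=class_weight):
            writer.write(example.SerializeToString())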
def serialise(data):
    ID, pos, dimensions, color, border, fill, text, img, seq_len, seq_mask = \
        data['ID'], data['pos'], data['dimensions'], data['color'], \
        data['border'], data['fill'], data['text'], data['img'], \
        int(data['seq_len']), data['seq_mask']

    ID = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(ID).numpy()]))
    pos = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(pos, tf.float32)).numpy()]))
    dimensions = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(dimensions, tf.float32)).numpy()]))
    color = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(color, tf.float32)).numpy()]))
    border = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(border, tf.float32)).numpy()]))
    fill = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(fill, tf.float32)).numpy()]))
    text = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(tf.cast(text, tf.float32)).numpy()]))
    # img is already serialised because we never decode it!
    img = Feature(bytes_list=BytesList(value=[img.numpy()]))
    seq_len = Feature(int64_list=Int64List(value=[seq_len]))
    seq_mask = Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(seq_mask).numpy()]))

    features = Features(feature={
        'ID': ID,
        'pos': pos,
        'dimensions': dimensions,
        'color': color,
        'border': border,
        'fill': fill,
        'text': text,
        'img': img,
        'seq_len': seq_len,
        'seq_mask': seq_mask,
    })
    example = Example(features=features)
    return example.SerializeToString()
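# A minimal parsing sketch for records written by the `serialise` function above. The
# float32 dtypes passed to tf.io.parse_tensor mirror the casts above; the dtypes of the
# 'ID' and 'seq_mask' tensors are not visible in the snippet, so they are left as raw
# serialised strings here. This is an assumption-driven sketch, not the original reader.
import tensorflow as tf

_LAYOUT_SPEC = {k: tf.io.FixedLenFeature([], tf.string)
                for k in ['ID', 'pos', 'dimensions', 'color', 'border',
                          'fill', 'text', 'img', 'seq_mask']}
_LAYOUT_SPEC['seq_len'] = tf.io.FixedLenFeature([], tf.int64)

def parse_layout_example(record):
    parsed = tf.io.parse_single_example(record, _LAYOUT_SPEC)
    out = {
        'seq_len': parsed['seq_len'],
        'img': parsed['img'],            # still the raw serialised image bytes
        'ID': parsed['ID'],              # serialised tensor, dtype unknown here
        'seq_mask': parsed['seq_mask'],  # serialised tensor, dtype unknown here
    }
    for key in ['pos', 'dimensions', 'color', 'border', 'fill', 'text']:
        out[key] = tf.io.parse_tensor(parsed[key], out_type=tf.float32)
    return out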
def make_tf_examples(string_features, int_features, labels):
    int_features += [[label] for label in zero_norm_labels(labels)]
    string_features = [
        Feature(bytes_list=BytesList(value=val)) for val in string_features
    ]
    int_features = [
        Feature(int64_list=Int64List(value=val)) for val in int_features
    ]
    all_features = string_features + int_features
    return [
        Example(features=Features(
            feature={
                "left": left,
                "target": target,
                "right": right,
                "left_ids": left_ids,
                "target_ids": target_ids,
                "right_ids": right_ids,
                "labels": label,
            }))
        for (
            left,
            target,
            right,
            left_ids,
            target_ids,
            right_ids,
            label,
        ) in zip(*split_list(all_features, parts=7))
    ]
def convert_to_example(
    adj,
    feature,
    label_data=None,
    label_mask=None,
):
    """
    Converts graph data (adjacency matrix, node features, and optional labels)
    into a serialised tf.train.Example.
    """
    adj_row, adj_col = np.nonzero(adj)
    adj_values = adj[adj_row, adj_col]
    adj_elem_len = len(adj_row)
    degrees = np.sum(adj, 0)
    adj_degrees = []
    for ar, ac in zip(adj_row, adj_col):
        if ar == ac:
            adj_degrees.append(0)
        else:
            adj_degrees.append(int(degrees[ar]))
    feature = np.array(feature)
    feature_row, feature_col = np.nonzero(feature)
    feature_values = feature[feature_row, feature_col]
    feature_elem_len = len(feature_row)
    # the dict literal below is evaluated before it is rebound to `feature`,
    # so `feature.shape` still refers to the node-feature matrix
    feature = {
        'adj_row': Feature(int64_list=Int64List(value=list(adj_row))),
        'adj_column': Feature(int64_list=Int64List(value=list(adj_col))),
        'adj_values': Feature(float_list=FloatList(value=list(adj_values))),
        'adj_elem_len': Feature(int64_list=Int64List(value=[adj_elem_len])),
        'adj_degrees': Feature(int64_list=Int64List(value=adj_degrees)),
        'feature_row': Feature(int64_list=Int64List(value=list(feature_row))),
        'feature_column': Feature(int64_list=Int64List(value=list(feature_col))),
        'feature_values': Feature(float_list=FloatList(value=list(feature_values))),
        'feature_elem_len': Feature(int64_list=Int64List(value=[feature_elem_len])),
        'size': Feature(int64_list=Int64List(value=list(feature.shape)))
    }
    if label_data is not None:
        label_data = np.nan_to_num(label_data)
        feature['label'] = Feature(int64_list=Int64List(
            value=label_data.astype(int)))
        feature['mask_label'] = Feature(int64_list=Int64List(
            value=label_mask.astype(int)))
    features = Features(feature=feature)
    ex = Example(features=features)
    return ex.SerializeToString()
def create_example(features: np.ndarray, label: np.int32):
    return Example(features=Features(
        feature={
            "features": NumpyToRecordConverter._bytes_feature(
                tf.io.serialize_tensor(features)),
            "label": Feature(int64_list=Int64List(value=[label]))
        })).SerializeToString()
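# A minimal round-trip sketch for the record above, assuming the serialised feature
# tensor is float32; the dtype and the function name here are assumptions, not part of
# the original converter.
import tensorflow as tf

_EXAMPLE_SPEC = {
    "features": tf.io.FixedLenFeature([], tf.string),
    "label": tf.io.FixedLenFeature([], tf.int64),
}

def parse_example(record):
    parsed = tf.io.parse_single_example(record, _EXAMPLE_SPEC)
    features = tf.io.parse_tensor(parsed["features"], out_type=tf.float32)
    return features, parsed["label"]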
def serialise_traj(data):
    features = {
        k: Feature(bytes_list=BytesList(
            value=[tf.io.serialize_tensor(v).numpy()]))
        for k, v in data.items() if k not in ['seq_lens']
    }
    features['seq_lens'] = Feature(int64_list=Int64List(value=[data['seq_lens']]))
    example = Example(features=Features(feature=features))
    return example.SerializeToString()
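# A minimal inverse sketch for `serialise_traj`. It assumes the caller knows which keys
# were stored and their dtypes; `keys_to_dtypes` is a hypothetical mapping such as
# {'obs': tf.float32, 'acts': tf.float32}, not something defined in the original code.
import tensorflow as tf

def parse_traj(record, keys_to_dtypes):
    spec = {k: tf.io.FixedLenFeature([], tf.string) for k in keys_to_dtypes}
    spec['seq_lens'] = tf.io.FixedLenFeature([], tf.int64)
    parsed = tf.io.parse_single_example(record, spec)
    out = {k: tf.io.parse_tensor(parsed[k], out_type=dtype)
           for k, dtype in keys_to_dtypes.items()}
    out['seq_lens'] = parsed['seq_lens']
    return out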
def serialise_vid(data):
    # seq_lens, masks, imgs, goal_imgs, label, label_embedding, tag = data['seq_lens'], data['masks'], data['imgs'], data['goal_imgs'], data['label'], data['label_embedding'], data['tag']
    features = {
        k: Feature(bytes_list=BytesList(
            value=[tf.io.serialize_tensor(v).numpy()]))
        for k, v in data.items() if k not in ['seq_lens']
    }
    features['seq_lens'] = Feature(int64_list=Int64List(value=[data['seq_lens']]))
    example = Example(features=Features(feature=features))
    return example.SerializeToString()
def __encode_input(self, mr, input_encoder):
    """Encodes the input, and creates a TF Example record out of it."""
    input_ids = input_encoder.encode(mr)
    input_ids.append(text_encoder.EOS_ID)
    features = {'inputs': Feature(int64_list=Int64List(value=input_ids))}
    example = Example(features=Features(feature=features))
    return example.SerializeToString()
def write_to_tfrecords(adj, feature, label_data, label_mask, tfrname):
    """
    Writes graph related data to disk.
    """
    adj_row, adj_col = np.nonzero(adj)
    adj_values = adj[adj_row, adj_col]
    adj_elem_len = len(adj_row)
    feature = np.array(feature)
    feature_row, feature_col = np.nonzero(feature)
    feature_values = feature[feature_row, feature_col]
    feature_elem_len = len(feature_row)
    features = Features(
        feature={
            'label': Feature(int64_list=Int64List(value=label_data)),
            'mask_label': Feature(int64_list=Int64List(value=label_mask)),
            'adj_row': Feature(int64_list=Int64List(value=list(adj_row))),
            'adj_column': Feature(int64_list=Int64List(value=list(adj_col))),
            'adj_values': Feature(float_list=FloatList(value=list(adj_values))),
            'adj_elem_len': Feature(int64_list=Int64List(value=[adj_elem_len])),
            'feature_row': Feature(int64_list=Int64List(value=list(feature_row))),
            'feature_column': Feature(int64_list=Int64List(value=list(feature_col))),
            'feature_values': Feature(float_list=FloatList(value=list(feature_values))),
            'feature_elem_len': Feature(int64_list=Int64List(value=[feature_elem_len])),
            'size': Feature(int64_list=Int64List(value=list(feature.shape)))
        })
    ex = Example(features=features)
    with TFRecordWriter(tfrname) as single_writer:
        single_writer.write(ex.SerializeToString())
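# A minimal reader sketch for the graph records above. VarLenFeature is used because the
# row/column/value lists differ per graph; scattering back to a dense adjacency matrix,
# and treating 'size' as a 2-element (nodes, features) shape, are assumptions about how
# the records are meant to be consumed rather than the original reader.
import tensorflow as tf

_GRAPH_SPEC = {
    'adj_row': tf.io.VarLenFeature(tf.int64),
    'adj_column': tf.io.VarLenFeature(tf.int64),
    'adj_values': tf.io.VarLenFeature(tf.float32),
    'size': tf.io.FixedLenFeature([2], tf.int64),
}

def parse_adjacency(record):
    parsed = tf.io.parse_single_example(record, _GRAPH_SPEC)
    rows = tf.sparse.to_dense(parsed['adj_row'])
    cols = tf.sparse.to_dense(parsed['adj_column'])
    vals = tf.sparse.to_dense(parsed['adj_values'])
    n = parsed['size'][0]  # number of nodes
    indices = tf.stack([rows, cols], axis=1)
    # rebuild the dense n x n adjacency matrix from its sparse encoding
    return tf.scatter_nd(indices, vals, tf.stack([n, n]))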
def _process_page(page_title: str):
    if _entity_vocab.contains(page_title, _language):
        page_id = _entity_vocab.get_id(page_title, _language)
    else:
        page_id = -1

    sentences = []

    def tokenize(text: str, add_prefix_space: bool):
        # clean up multiple spaces
        text = re.sub(r"\s+", " ", text).rstrip()
        if not text:
            return []
        if isinstance(_tokenizer, RobertaTokenizer):
            return _tokenizer.tokenize(text, add_prefix_space=add_prefix_space)
        else:
            return _tokenizer.tokenize(text)

    for paragraph in _dump_db.get_paragraphs(page_title):
        paragraph_text = paragraph.text

        # First, get paragraph links.
        # Paragraph links are represented as (link_title) and the start/end positions
        # of strings (link_start, link_end).
        paragraph_links = []
        for link in paragraph.wiki_links:
            link_title = _dump_db.resolve_redirect(link.title)
            # remove category links
            if link_title.startswith("Category:") and link.text.lower().startswith("category:"):
                paragraph_text = (paragraph_text[:link.start] +
                                  " " * (link.end - link.start) +
                                  paragraph_text[link.end:])
            else:
                if _entity_vocab.contains(link_title, _language):
                    paragraph_links.append((link_title, link.start, link.end))
                elif _include_unk_entities:
                    paragraph_links.append((UNK_TOKEN, link.start, link.end))

        sent_spans = _sentence_tokenizer.span_tokenize(paragraph_text.rstrip())
        for sent_start, sent_end in sent_spans:
            cur = sent_start
            sent_words = []
            sent_links = []
            # Look for links that are within the tokenized sentence.
            # If a link is found, we separate the sentence across the link and tokenize the pieces.
            for link_title, link_start, link_end in paragraph_links:
                if not (sent_start <= link_start < sent_end and link_end <= sent_end):
                    continue
                entity_id = _entity_vocab.get_id(link_title, _language)

                # read from the beginning of the sentence (or the current cursor) to the beginning of the linked text
                text = paragraph_text[cur:link_start]
                # add_prefix_space is needed because of the way RoBERTa was trained;
                # from the transformers docs: "This tokenizer has been trained to treat spaces like parts of the tokens
                # (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning
                # of the sentence (without space) or not"
                if cur == 0 or text.startswith(" ") or paragraph_text[cur - 1] == " ":
                    sent_words += tokenize(text, True)
                else:
                    sent_words += tokenize(text, False)

                # read the linked text and tokenize it, adding a prefix space as necessary
                link_text = paragraph_text[link_start:link_end]
                if link_start == 0 or link_text.startswith(" ") or paragraph_text[link_start - 1] == " ":
                    link_words = tokenize(link_text, True)
                else:
                    link_words = tokenize(link_text, False)

                # record the entity together with the start and end token indices of its mention
                sent_links.append((entity_id, len(sent_words),
                                   len(sent_words) + len(link_words)))
                # append the mention tokens so the sentence stays fully tokenized
                sent_words += link_words
                cur = link_end

            text = paragraph_text[cur:sent_end]
            if cur == 0 or text.startswith(" ") or paragraph_text[cur - 1] == " ":
                sent_words += tokenize(text, True)
            else:
                sent_words += tokenize(text, False)

            if len(sent_words) < _min_sentence_length or len(sent_words) > _max_num_tokens:
                continue
            sentences.append((sent_words, sent_links))

    ret = []
    words = []
    links = []
    # loop through the sentences; consecutive sentences are packed into one
    # tf.train.Example until _max_num_tokens would be exceeded
    for i, (sent_words, sent_links) in enumerate(sentences):
        links += [(id_, start + len(words), end + len(words))
                  for id_, start, end in sent_links]
        words += sent_words
        if i == len(sentences) - 1 or len(words) + len(sentences[i + 1][0]) > _max_num_tokens:
            if links or _include_sentences_without_entities:
                links = links[:_max_entity_length]
                # get the IDs based on the word list
                word_ids = _tokenizer.convert_tokens_to_ids(words)
                assert _min_sentence_length <= len(word_ids) <= _max_num_tokens
                # get the entity IDs from our entity vocab
                entity_ids = [id_ for id_, _, _ in links]
                assert len(entity_ids) <= _max_entity_length
                # token positions of each entity mention, padded with -1 up to _max_mention_length
                entity_position_ids = itertools.chain(
                    *[(list(range(start, end)) +
                       [-1] * (_max_mention_length - end + start))[:_max_mention_length]
                      for _, start, end in links])
                example = tf.train.Example(features=tf.train.Features(
                    feature=dict(
                        page_id=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=[page_id])),
                        word_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=word_ids)),
                        entity_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=entity_ids)),
                        entity_position_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=entity_position_ids)),
                    )))
                ret.append(example.SerializeToString())
            words = []
            links = []
    return ret
def _process_page(page_title: str):
    if _entity_vocab.contains(page_title, _language):
        page_id = _entity_vocab.get_id(page_title, _language)
    else:
        page_id = -1

    sentences = []

    def tokenize(text: str, add_prefix_space: bool):
        text = re.sub(r"\s+", " ", text).rstrip()
        if not text:
            return []
        if isinstance(_tokenizer, RobertaTokenizer):
            return _tokenizer.tokenize(text, add_prefix_space=add_prefix_space)
        else:
            return _tokenizer.tokenize(text)

    for paragraph in _dump_db.get_paragraphs(page_title):
        paragraph_text = paragraph.text

        # First, get paragraph links.
        # Paragraph links are represented as (link_title) and the start/end positions
        # of strings (link_start, link_end).
        paragraph_links = []
        for link in paragraph.wiki_links:
            link_title = _dump_db.resolve_redirect(link.title)
            # remove category links
            if link_title.startswith("Category:") and link.text.lower().startswith("category:"):
                paragraph_text = (paragraph_text[:link.start] +
                                  " " * (link.end - link.start) +
                                  paragraph_text[link.end:])
            else:
                if _entity_vocab.contains(link_title, _language):
                    paragraph_links.append((link_title, link.start, link.end))
                elif _include_unk_entities:
                    paragraph_links.append((UNK_TOKEN, link.start, link.end))

        sent_spans = _sentence_tokenizer.span_tokenize(paragraph_text.rstrip())
        for sent_start, sent_end in sent_spans:
            cur = sent_start
            sent_words = []
            sent_links = []
            # Look for links that are within the tokenized sentence.
            # If a link is found, we separate the sentence across the link and tokenize the pieces.
            for link_title, link_start, link_end in paragraph_links:
                if not (sent_start <= link_start < sent_end and link_end <= sent_end):
                    continue
                entity_id = _entity_vocab.get_id(link_title, _language)

                text = paragraph_text[cur:link_start]
                if cur == 0 or text.startswith(" ") or paragraph_text[cur - 1] == " ":
                    sent_words += tokenize(text, True)
                else:
                    sent_words += tokenize(text, False)

                link_text = paragraph_text[link_start:link_end]
                if link_start == 0 or link_text.startswith(" ") or paragraph_text[link_start - 1] == " ":
                    link_words = tokenize(link_text, True)
                else:
                    link_words = tokenize(link_text, False)

                sent_links.append((entity_id, len(sent_words),
                                   len(sent_words) + len(link_words)))
                sent_words += link_words
                cur = link_end

            text = paragraph_text[cur:sent_end]
            if cur == 0 or text.startswith(" ") or paragraph_text[cur - 1] == " ":
                sent_words += tokenize(text, True)
            else:
                sent_words += tokenize(text, False)

            if len(sent_words) < _min_sentence_length or len(sent_words) > _max_num_tokens:
                continue
            sentences.append((sent_words, sent_links))

    ret = []
    words = []
    links = []
    for i, (sent_words, sent_links) in enumerate(sentences):
        links += [(id_, start + len(words), end + len(words))
                  for id_, start, end in sent_links]
        words += sent_words
        if i == len(sentences) - 1 or len(words) + len(sentences[i + 1][0]) > _max_num_tokens:
            if links or _include_sentences_without_entities:
                links = links[:_max_entity_length]
                word_ids = _tokenizer.convert_tokens_to_ids(words)
                assert _min_sentence_length <= len(word_ids) <= _max_num_tokens
                entity_ids = [id_ for id_, _, _ in links]
                assert len(entity_ids) <= _max_entity_length
                entity_position_ids = itertools.chain(
                    *[(list(range(start, end)) +
                       [-1] * (_max_mention_length - end + start))[:_max_mention_length]
                      for _, start, end in links])
                example = tf.train.Example(features=tf.train.Features(
                    feature=dict(
                        page_id=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=[page_id])),
                        word_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=word_ids)),
                        entity_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=entity_ids)),
                        entity_position_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=entity_position_ids)),
                    )))
                ret.append(example.SerializeToString())
            words = []
            links = []
    return ret
def _create_int_feature(self, values):
    return Feature(int64_list=Int64List(value=list(values)))
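# Companion helpers in the same style, added here only as a sketch: the original snippet
# shows the int64 variant, so these names are assumptions, and Feature/FloatList/BytesList
# are assumed to be the tf.train classes already imported in this module.
def _create_float_feature(self, values):
    return Feature(float_list=FloatList(value=list(values)))

def _create_bytes_feature(self, values):
    return Feature(bytes_list=BytesList(value=list(values)))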
def serialise(data):
    obs, acts, goals, seq_lens, masks, dataset_path, tstep_idxs, imgs, goal_imgs, proprioceptive_features = \
        data['obs'], data['acts'], data['goals'], data['seq_lens'], data['masks'], \
        data['dataset_path'], data['tstep_idxs'], data['imgs'], data['goal_imgs'], \
        data['proprioceptive_features']
    # obs (1, 40, 18)
    # acts (1, 40, 7)
    # goals (1, 40, 11)
    # seq_lens (1,)
    # masks (1, 40)
    # dataset_path (1, 40)
    # tstep_idxs (1, 40)
    # imgs (1, 40, 200, 200, 3)
    # goal_imgs (1, 40, 200, 200, 3)
    # proprioceptive_features (1, 40, 7)

    # create a (:, 1, :, :, :) shaped goal image tensor for less file IO
    goal_imgs = tf.expand_dims(goal_imgs[:, 0, :, :, :], 1)

    obs = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(obs)).numpy(),
    ]))
    acts = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(acts)).numpy(),
    ]))
    goals = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(goals)).numpy(),
    ]))
    seq_lens = Feature(int64_list=Int64List(value=[
        seq_lens,
    ]))
    masks = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(masks)).numpy(),
    ]))
    imgs = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(imgs)).numpy(),
    ]))
    goal_imgs = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(goal_imgs)).numpy(),
    ]))
    proprioceptive_features = Feature(bytes_list=BytesList(value=[
        tf.io.serialize_tensor(tf.squeeze(proprioceptive_features)).numpy(),
    ]))

    features = Features(
        feature={
            'obs': obs,
            'acts': acts,
            'goals': goals,
            'seq_lens': seq_lens,
            'masks': masks,
            'imgs': imgs,
            'goal_imgs': goal_imgs,
            'proprioceptive_features': proprioceptive_features
        })
    example = Example(features=features)
    return example.SerializeToString()


# Sample Usage
# r = lfp.data.PlayDataloader(include_imgs = args.images, batch_size=1, window_size=args.window_size_max, min_window_size=args.window_size_min)
# rd = r.extract(TRAIN_DATA_PATHS, from_tfrecords=args.from_tfrecords)
# rd = r.load(rd)
# r_it = iter(rd)

# @tf.function
# def sample():
#     return r_it.next()

# data_paths = [str(STORAGE_PATH/'precompute')+f"/{x}.tfrecords" for x in range(0,8)]

# #@title write to gcs
# from tqdm import tqdm
# for path in data_paths:
#     with tf.io.TFRecordWriter(path) as file_writer:
#         print(path)
#         for i in tqdm(range(0,200)):
#             byte_stream = serialise(sample())
#             file_writer.write(byte_stream)
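# A minimal reader sketch for the trajectory records above, to pair with the sample
# writing loop. The dtypes passed to parse_tensor (float32 for the continuous streams
# and masks, uint8 for the image tensors) are assumptions about the upstream pipeline,
# not values confirmed by the snippet.
import tensorflow as tf

_PLAY_SPEC = {k: tf.io.FixedLenFeature([], tf.string)
              for k in ['obs', 'acts', 'goals', 'masks', 'imgs',
                        'goal_imgs', 'proprioceptive_features']}
_PLAY_SPEC['seq_lens'] = tf.io.FixedLenFeature([], tf.int64)

def parse_play_example(record):
    parsed = tf.io.parse_single_example(record, _PLAY_SPEC)
    out = {'seq_lens': parsed['seq_lens']}
    for key in ['obs', 'acts', 'goals', 'masks', 'proprioceptive_features']:
        out[key] = tf.io.parse_tensor(parsed[key], out_type=tf.float32)
    for key in ['imgs', 'goal_imgs']:
        out[key] = tf.io.parse_tensor(parsed[key], out_type=tf.uint8)
    return out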
def _process_page(pmid: str):
    # print("start _process_page", pmid)
    if _entity_vocab.page_contains(pmid):
        # page_id = _entity_vocab.get_id(pmid)
        # TODO: verify if this is okay
        # we just use the PMID as the page_id; it doesn't look like it is used anywhere
        # really, so it should be fine.
        page_id = int(pmid)
    else:
        page_id = -1

    sentences = []

    def tokenize(text: str, add_prefix_space: bool):
        # clean up multiple spaces
        text = re.sub(r"\s+", " ", text).rstrip()
        if not text:
            return []
        if isinstance(_tokenizer, RobertaTokenizer):
            return _tokenizer.tokenize(text, add_prefix_space=add_prefix_space)
        else:
            return _tokenizer.tokenize(text)

    # print("start get data")
    # we concatenate the title and abstract like they do in MedMentions to get the entity spans to match
    page_data = _medmentions_db.get_data()[pmid]
    paragraph_text = page_data['title'] + " " + page_data['abstract']
    # print("end get data")

    # First, get paragraph links.
    # Paragraph links are represented as (link_title) and the start/end positions
    # of strings (link_start, link_end).
    paragraph_links = []
    # print("start loop through entities")
    for entity in page_data['entities']:
        if _entity_vocab.contains(entity[4], _language):
            paragraph_links.append((entity[4], entity[0], entity[1]))
        elif _include_unk_entities:
            paragraph_links.append((UNK_TOKEN, entity[0], entity[1]))
    # print("stop loop through entities")

    sent_spans = _sentence_tokenizer.span_tokenize(paragraph_text.rstrip())
    for sent_start, sent_end in sent_spans:
        cur = sent_start
        sent_words = []
        sent_links = []
        # Look for links that are within the tokenized sentence.
        # If a link is found, we separate the sentence across the link and tokenize the pieces.
        for cui_id, ent_start, ent_end in paragraph_links:
            if not (sent_start <= ent_start < sent_end and ent_end <= sent_end):
                continue
            entity_id = _entity_vocab.get_id(cui_id, _language)

            # read from the beginning of the sentence (or the current cursor) to the beginning of the linked text
            text = paragraph_text[cur:ent_start]
            # add_prefix_space is needed because of the way RoBERTa was trained;
            # from the transformers docs: "This tokenizer has been trained to treat spaces like parts of the tokens
            # (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning
            # of the sentence (without space) or not"
            if cur == 0 or text.startswith(" ") or paragraph_text[cur - 1] == " ":
                sent_words += tokenize(text, True)
            else:
                sent_words += tokenize(text, False)

            # read the linked text and tokenize it, adding a prefix space as necessary
            link_text = paragraph_text[ent_start:ent_end]
            if ent_start == 0 or link_text.startswith(" ") or paragraph_text[ent_start - 1] == " ":
                link_words = tokenize(link_text, True)
            else:
                link_words = tokenize(link_text, False)

            # record the entity together with the start and end token indices of its mention
            sent_links.append((entity_id, len(sent_words),
                               len(sent_words) + len(link_words)))
            # append the mention tokens so the sentence stays fully tokenized
            sent_words += link_words
            cur = ent_end

        text = paragraph_text[cur:sent_end]
        if cur == 0 or text.startswith(" ") or paragraph_text[cur - 1] == " ":
            sent_words += tokenize(text, True)
        else:
            sent_words += tokenize(text, False)

        if len(sent_words) < _min_sentence_length or len(sent_words) > _max_num_tokens:
            continue
        sentences.append((sent_words, sent_links))
    # print("finish sent spans")

    ret = []
    words = []
    links = []
    # loop through the sentences in the paragraph
    for i, (sent_words, sent_links) in enumerate(sentences):
        links += [(id_, start + len(words), end + len(words))
                  for id_, start, end in sent_links]
        words += sent_words
        # we only create the tf example on the last sentence / when we hit the max number of tokens
        if i == len(sentences) - 1 or len(words) + len(sentences[i + 1][0]) > _max_num_tokens:
            if links or _include_sentences_without_entities:
                links = links[:_max_entity_length]
                # get the IDs based on the word list
                word_ids = _tokenizer.convert_tokens_to_ids(words)
                assert _min_sentence_length <= len(word_ids) <= _max_num_tokens
                # get the entity IDs from our entity vocab
                entity_ids = [id_ for id_, _, _ in links]
                assert len(entity_ids) <= _max_entity_length
                # token positions of each entity mention, padded with -1 up to _max_mention_length
                entity_position_ids = itertools.chain(
                    *[(list(range(start, end)) +
                       [-1] * (_max_mention_length - end + start))[:_max_mention_length]
                      for _, start, end in links])
                example = tf.train.Example(features=tf.train.Features(
                    feature=dict(
                        page_id=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=[page_id])),
                        word_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=word_ids)),
                        entity_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=entity_ids)),
                        entity_position_ids=tf.train.Feature(
                            int64_list=tf.train.Int64List(value=entity_position_ids)),
                    )))
                ret.append(example.SerializeToString())
            words = []
            links = []
    # print("about to return")
    return ret
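# A minimal parsing sketch for the pretraining examples produced by the _process_page
# functions above. word_ids, entity_ids, and entity_position_ids vary in length per
# example, so VarLenFeature is used; reshaping the flattened entity_position_ids back to
# (-1, max_mention_length) is an assumption based on how they are padded above.
import tensorflow as tf

_PRETRAIN_SPEC = {
    'page_id': tf.io.FixedLenFeature([1], tf.int64),
    'word_ids': tf.io.VarLenFeature(tf.int64),
    'entity_ids': tf.io.VarLenFeature(tf.int64),
    'entity_position_ids': tf.io.VarLenFeature(tf.int64),
}

def parse_pretraining_example(record, max_mention_length):
    parsed = tf.io.parse_single_example(record, _PRETRAIN_SPEC)
    word_ids = tf.sparse.to_dense(parsed['word_ids'])
    entity_ids = tf.sparse.to_dense(parsed['entity_ids'])
    # one row of token positions (padded with -1) per entity mention
    entity_position_ids = tf.reshape(
        tf.sparse.to_dense(parsed['entity_position_ids']),
        (-1, max_mention_length))
    return parsed['page_id'][0], word_ids, entity_ids, entity_position_ids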