Example #1
def save_tfrecords(save_dir, train_list, eval_list, test_list, idx):
    with TFRecordWriter(os.path.join(save_dir,
                                     f"{idx}_train_.tfrecords")) as writer:
        for e in train_list:
            writer.write(e)
    with TFRecordWriter(os.path.join(save_dir,
                                     f"{idx}_test_.tfrecords")) as writer:
        for e in test_list:
            writer.write(e)
    with TFRecordWriter(os.path.join(save_dir,
                                     f"{idx}_eval_.tfrecords")) as writer:
        for e in eval_list:
            writer.write(e)
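
If the splits need to be read back later, the same file naming can be reused. A minimal read-back sketch (illustrative, not part of the original example; parsing is omitted because the feature layout depends on how the examples were serialized):

import os

import tensorflow as tf


def load_split(save_dir, idx, split="train"):
    # Mirrors the f"{idx}_{split}_.tfrecords" naming used by save_tfrecords.
    path = os.path.join(save_dir, f"{idx}_{split}_.tfrecords")
    return tf.data.TFRecordDataset(path)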
Example #2
def write_tfr_batches(data, label, batch_size, num_batches, savepath, dataset_type):
    for batch in range(num_batches):
        start = batch * batch_size
        # The last batch picks up any remaining records.
        if batch != num_batches - 1:
            next_start = (batch + 1) * batch_size
        else:
            next_start = len(data)

        filename = '{}_0{}.tfrecord'.format(dataset_type, batch)
        filepath = os.path.join(savepath, filename)
        with TFRecordWriter(filepath) as writer:
            for i in range(start, next_start):
                record = sequence_to_tfexample(sequence=data[i], sentiment=label[i])
                writer.write(record.SerializeToString())
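
sequence_to_tfexample is not shown here; a minimal sketch of what such a helper might look like, assuming the sequence is a list of integer token IDs and the sentiment is a single integer label (the real helper may differ):

import tensorflow as tf


def sequence_to_tfexample(sequence, sentiment):
    # Hypothetical layout: an int64 list for the tokens and an int64 scalar
    # for the sentiment label.
    feature = {
        "sequence": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(t) for t in sequence])),
        "sentiment": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(sentiment)])),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))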
Example #3
def preprocess(dataset, destination_folder, split_adj):
    """
    Preprocesses tox21 data
    Args:
        dataet
        destination_folder
        split_adj
    Return:
        none
    """
    tox21_df = pd.read_csv(dataset)
    task_names = list(tox21_df.columns)
    task_names.remove('smiles')
    task_names.remove('mol_id')
    if not os.path.exists(destination_folder):
        os.mkdir(destination_folder)
    train_num = int(len(tox21_df) * 0.8)
    eval_num = int(len(tox21_df) * 0.1)
    test_num = len(tox21_df) - train_num - eval_num
    split = ['train'] * train_num + ['eval'] * eval_num + ['test'] * test_num
    shuffle(split)
    tox21_df['split'] = split

    Molecule = recordclass('Molecule', 'mol label mask_label')

    for split in ['train', 'eval', 'test']:
        split_df = tox21_df[tox21_df['split'] == split]
        molecules = []
        for index, row in split_df.iterrows():
            mol = Chem.MolFromSmiles(row['smiles'])
            label = (row[:12].to_dense().values == 1).astype(
                np.float32).tolist()
            mask_label = np.invert(np.isnan(row[:12].values.astype(
                np.float32))).astype(np.float32)
            molecule = Molecule(mol, label, mask_label)
            molecules.append(molecule)
            with TFRecordWriter(
                    os.path.join(destination_folder, row['mol_id'] + '_' +
                                 split + '.tfrecords')) as single_writer:
                ex = molecule_to_example(molecule, split_adj)
                single_writer.write(ex.SerializeToString())
        #with tf.python_io.TFRecordWriter(os.path.join(dir_name, '_' + split + '.tfrecords')) as dataset_writer:
        #for molecule in molecules:
        #ex = molecule_to_example(molecule, split_adj)
        #dataset_writer.write(ex.SerializeToString())

    task_names = "\n".join(task_names)
    with open(os.path.join(destination_folder, 'tasks.txt'), 'w') as text_file:
        text_file.write(task_names)
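
Because preprocess writes one file per molecule named <mol_id>_<split>.tfrecords, a whole split can be read back by globbing. A sketch (illustrative; parsing depends on the molecule_to_example layout, which is not shown):

import os

import tensorflow as tf


def load_tox21_split(destination_folder, split="train"):
    # Matches the <mol_id>_<split>.tfrecords naming used above.
    pattern = os.path.join(destination_folder, f"*_{split}.tfrecords")
    files = sorted(tf.io.gfile.glob(pattern))
    return tf.data.TFRecordDataset(files)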
Example #4
def make_dataset(data_path,
                 output_dir,
                 name,
                 bert_client,
                 training=True,
                 label2int=None,
                 class_weight=None,
                 n_split=1):
    """
    data_path: path to the data (csv)
    label2int: dict
    class_weight: list
    n_split: Save the dataset to `n_split` seperated files

    Write dataset to ${output_dir}/${name}_${seq}.tfrecord (seq = 0 ~ n_split-1)
    
    Return file names of the created datasets (list), size of the dataset
    """

    data = pd.read_csv(data_path)

    # replace empty titles with 'none'
    data['title1_en'] = data['title1_en'].apply(lambda x: 'none'
                                                if x.strip() == '' else x)
    data['title2_en'] = data['title2_en'].apply(lambda x: 'none'
                                                if x.strip() == '' else x)

    n_samples = math.ceil(len(data) / n_split)
    filenames = []

    with tqdm(total=len(data)) as pbar:

        for i in range(n_split):
            filenames.append(f"{name}_{i}.tfrecord")
            with TFRecordWriter(os.path.join(output_dir,
                                             filenames[-1])) as writer:
                examples = create_examples(data=data[i * n_samples:(i + 1) *
                                                     n_samples],
                                           bert_client=bert_client,
                                           training=training,
                                           label2int=label2int,
                                           class_weight=class_weight)
                for example in examples:
                    writer.write(example.SerializeToString())
                    pbar.update()

    return filenames, len(data)
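
The shards follow the ${output_dir}/${name}_${i}.tfrecord pattern described in the docstring, so they can be read back in parallel. An illustrative sketch, not part of the original code (tf.data.AUTOTUNE requires TF 2.4+; older versions use tf.data.experimental.AUTOTUNE):

import os

import tensorflow as tf


def load_shards(output_dir, name, n_split):
    # One path per shard written by make_dataset.
    paths = [os.path.join(output_dir, f"{name}_{i}.tfrecord")
             for i in range(n_split)]
    return tf.data.Dataset.from_tensor_slices(paths).interleave(
        tf.data.TFRecordDataset, num_parallel_calls=tf.data.AUTOTUNE)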
Example #5
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input_files', nargs='+', metavar='INPUT-FILE')
    parser.add_argument('output_file', metavar='OUTPUT-FILE')
    parser.add_argument(
        '-i',
        '--instrument-re',
        type=re.compile,
        default=re.compile('.*'),
        metavar='REGEX',
        help='a regular expression matching the instrument name')
    parser.add_argument('--instrument-id',
                        type=lambda l: [int(x) for x in l.split(',')],
                        default=None,
                        metavar='ID',
                        help='the integer ID(s) of the instrument(s)')
    parser.add_argument('-p',
                        '--program',
                        type=lambda l: [int(x) for x in l.split(',')],
                        default=None,
                        metavar='PRG',
                        help='the MIDI program number(s)')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--drums',
                       action='store_true',
                       help='include only drums')
    group.add_argument('--no-drums',
                       action='store_false',
                       dest='drums',
                       help='exclude drums')
    group.set_defaults(drums=None)
    args = parser.parse_args()

    tf.enable_eager_execution()

    with TFRecordWriter(args.output_file) as writer:
        for record in tf.data.TFRecordDataset(args.input_files):
            sequence = music_pb2.NoteSequence.FromString(record.numpy())
            filter_sequence(sequence,
                            instrument_re=args.instrument_re,
                            instrument_ids=args.instrument_id,
                            programs=args.program,
                            drums=args.drums)
            writer.write(sequence.SerializeToString())
Example #6
def write_examples_to_tfrecord(examples,
                               label_list,
                               max_seq_length,
                               tokenizer,
                               output_file,
                               is_testing,
                               pbar_desc=None):
    """Write a set of `InputExample`s to a TFRecord file."""
    def create_int_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    label_map = {label: i for i, label in enumerate(label_list)}

    with TFRecordWriter(output_file) as writer:
        for example in tqdm(examples, desc=pbar_desc):
            feature = _convert_single_example(example=example,
                                              label_map=label_map,
                                              max_seq_length=max_seq_length,
                                              tokenizer=tokenizer,
                                              is_testing=is_testing)

            tf_features = {
                'input_ids':
                create_int_feature(feature.input_ids),
                'input_mask':
                create_int_feature(feature.input_mask),
                'segment_ids':
                create_int_feature(feature.segment_ids),
                'label_id':
                create_int_feature([feature.label_id]),
                'is_real_example':
                create_int_feature([int(feature.is_real_example)])
            }

            tf_example = tf.train.Example(features=tf.train.Features(
                feature=tf_features))
            writer.write(tf_example.SerializeToString())
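
The feature names and dtypes in tf_features determine the read-side spec. A matching parse sketch, assuming the features were padded to max_seq_length when written (as is usual for BERT-style pipelines):

import tensorflow as tf


def parse_fn(serialized, max_seq_length):
    # Names and dtypes mirror the tf_features dict written above.
    spec = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_id": tf.io.FixedLenFeature([], tf.int64),
        "is_real_example": tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_single_example(serialized, spec)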
Example #7
    def build(
        cls,
        dump_db: DumpDB,
        tokenizer: PreTrainedTokenizer,
        sentence_tokenizer: SentenceTokenizer,
        entity_vocab: EntityVocab,
        output_dir: str,
        max_seq_length: int,
        max_entity_length: int,
        max_mention_length: int,
        min_sentence_length: int,
        include_sentences_without_entities: bool,
        include_unk_entities: bool,
        pool_size: int,
        chunk_size: int,
        max_num_documents: int,
    ):

        target_titles = [
            title for title in dump_db.titles()
            if not (":" in title and title.lower().split(":")[0] in
                    ("image", "file", "category"))
        ]
        random.shuffle(target_titles)

        if max_num_documents is not None:
            target_titles = target_titles[:max_num_documents]

        max_num_tokens = max_seq_length - 2  # 2 for [CLS] and [SEP]

        tokenizer.save_pretrained(output_dir)

        entity_vocab.save(os.path.join(output_dir, ENTITY_VOCAB_FILE))
        number_of_items = 0
        tf_file = os.path.join(output_dir, DATASET_FILE)
        options = tf.io.TFRecordOptions(
            tf.compat.v1.io.TFRecordCompressionType.GZIP)
        with TFRecordWriter(tf_file, options=options) as writer:
            with tqdm(total=len(target_titles)) as pbar:
                initargs = (
                    dump_db,
                    tokenizer,
                    sentence_tokenizer,
                    entity_vocab,
                    max_num_tokens,
                    max_entity_length,
                    max_mention_length,
                    min_sentence_length,
                    include_sentences_without_entities,
                    include_unk_entities,
                )
                with closing(
                        Pool(pool_size,
                             initializer=WikipediaPretrainingDataset.
                             _initialize_worker,
                             initargs=initargs)) as pool:
                    for ret in pool.imap(
                            WikipediaPretrainingDataset._process_page,
                            target_titles,
                            chunksize=chunk_size):
                        for data in ret:
                            writer.write(data)
                            number_of_items += 1
                        pbar.update()

        with open(os.path.join(output_dir, METADATA_FILE),
                  "w") as metadata_file:
            json.dump(
                dict(
                    number_of_items=number_of_items,
                    max_seq_length=max_seq_length,
                    max_entity_length=max_entity_length,
                    max_mention_length=max_mention_length,
                    min_sentence_length=min_sentence_length,
                    tokenizer_class=tokenizer.__class__.__name__,
                    language=dump_db.language,
                ),
                metadata_file,
                indent=2,
            )
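
Because the writer uses GZIP TFRecordOptions, the dataset file has to be read back with a matching compression_type. A small sketch (output_dir and dataset_file stand in for the same paths used above):

import os

import tensorflow as tf


def load_pretraining_records(output_dir, dataset_file):
    # The compression must match the TFRecordOptions used when writing.
    return tf.data.TFRecordDataset(os.path.join(output_dir, dataset_file),
                                   compression_type="GZIP")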
Example #8
def merge_shards(filename, num_shards_to_merge, out_tmp_dir, batch_size, ensure_batch_multiple):
  np.random.seed([int.from_bytes(os.urandom(4), byteorder='little') for i in range(5)])

  tfoptions = TFRecordOptions(compression_type = 'ZLIB')
  record_writer = TFRecordWriter(filename,tfoptions)

  binaryInputNCHWPackeds = []
  globalInputNCs = []
  policyTargetsNCMoves = []
  globalTargetsNCs = []
  scoreDistrNs = []
  valueTargetsNCHWs = []

  for input_idx in range(num_shards_to_merge):
    shard_filename = os.path.join(out_tmp_dir, str(input_idx) + ".npz")
    with np.load(shard_filename) as npz:
      assert(set(npz.keys()) == set(keys))

      binaryInputNCHWPacked = npz["binaryInputNCHWPacked"]
      globalInputNC = npz["globalInputNC"]
      policyTargetsNCMove = npz["policyTargetsNCMove"].astype(np.float32)
      globalTargetsNC = npz["globalTargetsNC"]
      scoreDistrN = npz["scoreDistrN"].astype(np.float32)
      valueTargetsNCHW = npz["valueTargetsNCHW"].astype(np.float32)

      binaryInputNCHWPackeds.append(binaryInputNCHWPacked)
      globalInputNCs.append(globalInputNC)
      policyTargetsNCMoves.append(policyTargetsNCMove)
      globalTargetsNCs.append(globalTargetsNC)
      scoreDistrNs.append(scoreDistrN)
      valueTargetsNCHWs.append(valueTargetsNCHW)

  ###
  #WARNING - if adding anything here, also add it to joint_shuffle below!
  ###
  binaryInputNCHWPacked = np.concatenate(binaryInputNCHWPackeds)
  globalInputNC = np.concatenate(globalInputNCs)
  policyTargetsNCMove = np.concatenate(policyTargetsNCMoves)
  globalTargetsNC = np.concatenate(globalTargetsNCs)
  scoreDistrN = np.concatenate(scoreDistrNs)
  valueTargetsNCHW = np.concatenate(valueTargetsNCHWs)

  num_rows = binaryInputNCHWPacked.shape[0]
  assert(globalInputNC.shape[0] == num_rows)
  assert(policyTargetsNCMove.shape[0] == num_rows)
  assert(globalTargetsNC.shape[0] == num_rows)
  assert(scoreDistrN.shape[0] == num_rows)
  assert(valueTargetsNCHW.shape[0] == num_rows)

  [binaryInputNCHWPacked,globalInputNC,policyTargetsNCMove,globalTargetsNC,scoreDistrN,valueTargetsNCHW] = (
    joint_shuffle_take_first_n(num_rows,[binaryInputNCHWPacked,globalInputNC,policyTargetsNCMove,globalTargetsNC,scoreDistrN,valueTargetsNCHW])
  )

  assert(binaryInputNCHWPacked.shape[0] == num_rows)
  assert(globalInputNC.shape[0] == num_rows)
  assert(policyTargetsNCMove.shape[0] == num_rows)
  assert(globalTargetsNC.shape[0] == num_rows)
  assert(scoreDistrN.shape[0] == num_rows)
  assert(valueTargetsNCHW.shape[0] == num_rows)

  #Just truncate and lose the batch at the end, it's fine
  num_batches = (num_rows // (batch_size * ensure_batch_multiple)) * ensure_batch_multiple
  for i in range(num_batches):
    start = i*batch_size
    stop = (i+1)*batch_size

    example = tfrecordio.make_tf_record_example(
      binaryInputNCHWPacked,
      globalInputNC,
      policyTargetsNCMove,
      globalTargetsNC,
      scoreDistrN,
      valueTargetsNCHW,
      start,
      stop
    )
    record_writer.write(example.SerializeToString())

  jsonfilename = os.path.splitext(filename)[0] + ".json"
  with open(jsonfilename,"w") as f:
    json.dump({"num_rows":num_rows,"num_batches":num_batches},f)

  record_writer.close()
  return num_batches * batch_size
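
Similarly, merge_shards writes with ZLIB compression, so a reader must be configured the same way. A minimal sketch:

import tensorflow as tf


def load_merged_shards(filename):
    # compression_type must match the TFRecordOptions used by merge_shards.
    return tf.data.TFRecordDataset(filename, compression_type="ZLIB")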
Example #9
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


meta_file = "data/raw/photo.json"
image_dir = "data/raw/photos"
record_file = "data/preprocessed/yelp_photos_{}.tfrecords"
data = open(meta_file).readlines()
n = len(data)
split_idx = n * np.array([0, 0.16, 0.32, 0.48, 0.64, 0.80, 1.0])
split_idx = split_idx.astype('int32')
idx = np.arange(n)
np.random.shuffle(idx)

for i, split in enumerate(
        tqdm(['train1', 'train2', 'train3', 'train4', 'train5', 'test'])):
    with TFRecordWriter(record_file.format(split)) as writer:
        for j in tqdm(idx[split_idx[i]:split_idx[i + 1]]):
            datapoint = json.loads(data[j])
            image_file = os.path.join(image_dir,
                                      datapoint["photo_id"] + '.jpg')
            image = tf.io.decode_jpeg(tf.io.read_file(image_file))

            h, w, _ = image.shape
            res = min(h, w)
            h0, w0 = (h - res) // 2, (w - res) // 2
            image = tf.image.crop_to_bounding_box(image, h0, w0, res, res)
            image_string = tf.io.encode_jpeg(image)
            label = YELP_CLASSES[datapoint["label"]]

            feature = {
                'image': _bytes_feature(image_string.numpy()),
                'label': _int64_feature(label),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
Example #10
import addressbook_pb2
import tensorflow as tf
from tensorflow.io import TFRecordWriter, read_file
from tensorflow.data import TFRecordDataset
from google.protobuf.json_format import MessageToDict
from google.protobuf.json_format import MessageToJson

if __name__ == '__main__':
    filename = "test.tfrecords"

    # Create a TFRecord file
    with TFRecordWriter(filename) as writer:
        for i in range(1, 5):
            person = addressbook_pb2.Person()
            person.id = 1000 + i
            person.name = "John Doe " + str(i)
            phone = person.phones.add()
            phone.type = addressbook_pb2.Person.PhoneType.HOME
            phone.number = "333-431" + str(i)
            phone = person.phones.add()
            phone.type = addressbook_pb2.Person.PhoneType.WORK
            phone.number = "444-431" + str(i)

            writer.write(person.SerializeToString())

    # Read back TFRecord file
    tf.enable_eager_execution()
    dataset = TFRecordDataset(filename)
    for record in dataset.take(2):
        person = addressbook_pb2.Person()
        person.ParseFromString(record.numpy())
        print(MessageToJson(person))
Example #11
def main():
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_dir', metavar='INPUT-DIR')
    parser.add_argument('output_prefix', metavar='OUTPUT-PREFIX')

    parser.add_argument('-b', '--bars-per-segment', type=lambda l: [int(x) for x in l.split(',')],
                        default=[8], metavar='NUM',
                        help='the number of bars per segment (default: 8)')
    parser.add_argument('-n', '--min-notes-per-segment', type=int, default=1, metavar='NUM',
                        help='discard segments with less than the given number of notes '
                             '(default: 1)')
    parser.add_argument('-t', '--force-tempo', type=float, default=None, metavar='BPM',
                        help='warp the sequences to match the given tempo')
    parser.add_argument('--skip-bars', type=int, default=0, metavar='NUM',
                        help='skip the given number of bars at the beginning')
    parser.add_argument('--merge-instruments', action='store_true',
                        help='causes equivalent instruments to be merged')

    args = parser.parse_args()

    # Collect all paths
    paths = []
    for dir_path, dirnames, filenames in os.walk(args.input_dir):
        dirnames.sort()
        filenames.sort()
        for fname in filenames:
            paths.append(os.path.join(dir_path, fname))

    metadata = []
    with TFRecordWriter(args.output_prefix + '.tfrecord') as writer:
        for path in paths:
            rel_path = os.path.relpath(path, args.input_dir)
            print(rel_path, file=sys.stderr, flush=True)
            midi = pretty_midi.PrettyMIDI(path)
            sequence = midi_io.midi_to_note_sequence(midi)
            sequence.filename = rel_path

            # Record the downbeat times so that they get updated by normalize_tempo later
            for time in midi.get_downbeats():
                annotation = sequence.text_annotations.add()
                annotation.time = time
                annotation.annotation_type = BEAT
                annotation.text = DOWNBEAT

            if args.merge_instruments:
                merge_equivalent_instruments(sequence)

            if args.force_tempo:
                sequence = note_sequence_utils.normalize_tempo(sequence, args.force_tempo)

            # Get the updated downbeats
            downbeats = [a.time for a in sequence.text_annotations
                         if (a.annotation_type, a.text) == (BEAT, DOWNBEAT)]
            del sequence.text_annotations[-len(downbeats):]

            for start, end, segment in note_sequence_utils.split_on_downbeats(
                    sequence, downbeats=downbeats,
                    bars_per_segment=args.bars_per_segment, skip_bars=args.skip_bars,
                    min_notes_per_segment=args.min_notes_per_segment, include_span=True):
                writer.write(segment.SerializeToString())
                metadata.append({
                    'filename': rel_path,
                    'segment_id': f'bar_{start}-{end}'
                })

    with gzip.open(args.output_prefix + '_meta.json.gz', 'wt', encoding='utf-8') as f:
        json.dump(metadata, f, separators=(',', ':'))
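
The segments and the gzipped metadata written above can be loaded back together. An illustrative sketch using the same output_prefix convention:

import gzip
import json

import tensorflow as tf


def load_segments(output_prefix):
    # One serialized NoteSequence per record, one metadata entry per segment.
    with gzip.open(output_prefix + '_meta.json.gz', 'rt', encoding='utf-8') as f:
        metadata = json.load(f)
    dataset = tf.data.TFRecordDataset(output_prefix + '.tfrecord')
    return dataset, metadata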