def _load_shard(self, shard_name): logging.info('Loading data from: %s' % shard_name) with open(shard_name) as f: sentences = f.readlines() if self._reverse: sentences_reverse = [] for sentence in sentences: splitted = sentence.split() splitted.reverse() sentences_reverse.append(' '.join(splitted)) sentences = sentences_reverse if self._shuffle_on_load: random.shuffle(sentences) ids = [ self.vocab.encode(sentence, self._reverse) for sentence in sentences ] if self._use_char_inputs: chars_ids = [ self.vocab.encode_chars(sentence, self._reverse) for sentence in sentences ] else: chars_ids = [None] * len(ids) logging.info('Loaded %d sentences.' % len(ids)) return list(zip(ids, chars_ids))
def file_based_convert_examples_to_features(self, input_file, output_file):
    """Convert a set of `InputExample`s to a MindDataset file.

    Args:
        input_file: path to a TSV file of input examples.
        output_file: path of the MindRecord file to create.
    """
    examples = self._read_tsv(input_file)

    writer = FileWriter(file_name=output_file, shard_num=1)
    # Every field is stored as a variable-length int64 vector.
    nlp_schema = {
        "input_ids": {"type": "int64", "shape": [-1]},
        "input_mask": {"type": "int64", "shape": [-1]},
        "segment_ids": {"type": "int64", "shape": [-1]},
        "label_ids": {"type": "int64", "shape": [-1]},
    }
    # NOTE(review): the "proprocessed" typo is kept byte-for-byte — the
    # description is written into the dataset; confirm nothing matches on
    # it before fixing the spelling.
    writer.add_schema(nlp_schema, "proprocessed classification dataset")

    data = []
    for index, example in enumerate(examples):
        if index % 10000 == 0:
            # Lazy %-args: skip formatting when the log level filters this out.
            logging.info("Writing example %d of %d", index, len(examples))
        record = self._convert_example_to_record(example, self.max_seq_len,
                                                 self.tokenizer)
        sample = {
            "input_ids": np.array(record.input_ids, dtype=np.int64),
            "input_mask": np.array(record.input_mask, dtype=np.int64),
            "segment_ids": np.array(record.segment_ids, dtype=np.int64),
            "label_ids": np.array([record.label_id], dtype=np.int64),
        }
        data.append(sample)

    # All samples are buffered and written out in a single batch.
    writer.write_raw_data(data)
    writer.commit()
def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False, reverse=False): self._vocab = vocab self._all_shards = glob.glob(filepattern) logging.info('Found %d shards at %s' % (len(self._all_shards), filepattern)) self._shards_to_choose = [] self._reverse = reverse self._test = test self._shuffle_on_load = shuffle_on_load self._use_char_inputs = hasattr(vocab, 'encode_chars') self._ids = self._load_random_shard()