def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("data/train.tsv")
    validation_data = reader.read("data/dev.tsv")
    return training_data, validation_data
def read_data(
    reader: DatasetReader,
    train_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
    valid_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv",
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    logger.critical("Reading the data. Lazy variable set to {}".format(reader.lazy))
    start_time = time.time()
    # Expect that this is the only time this is called.
    reader.mode = "train"
    training_data = reader.read(train_data_path)
    # Set the examples differently for validation.
    reader.mode = "valid"
    validation_data = reader.read(valid_data_path)  # need to unlimit the examples here...
    logger.critical("Finished the call to read the data. Took {} seconds".format(time.time() - start_time))
    return training_data, validation_data
def read_data(
    reader: DatasetReader,
    tgt_domain: str,
    input_path: str,
    domains: List[str],
) -> Tuple[Iterable[Instance], Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    # Train on every domain except the held-out target domain.
    training_data = None
    for domain in domains:
        if domain != tgt_domain:
            if training_data is None:
                training_data = reader.read(input_path + domain + '/' + domain + '_neg.txt')
            else:
                training_data += reader.read(input_path + domain + '/' + domain + '_neg.txt')
    # The target domain supplies both validation and test data:
    # up to 2000 examples (at most 25% of the data) go to validation.
    valid_test_data = reader.read(input_path + tgt_domain + '/' + tgt_domain + '_neg.txt')
    as_per_percent = int(len(valid_test_data) * 0.25)
    valid_size = 2000 if as_per_percent >= 2000 else as_per_percent
    validation_data = valid_test_data[:valid_size]
    test_data = valid_test_data[valid_size:]
    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    test_data = AllennlpDataset(test_data)
    print("train:", len(training_data), "validation:", len(validation_data), "test:", len(test_data))
    return training_data, validation_data, test_data
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("quick_start/data/movie_review/train.tsv")
    validation_data = reader.read("quick_start/data/movie_review/dev.tsv")
    return training_data, validation_data
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(TRAIN_PATH)
    validation_data = reader.read(DEV_PATH)
    return training_data, validation_data
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("/path/to/your/training/data")
    validation_data = reader.read("/path/to/your/validation/data")
    return training_data, validation_data
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("./data/sample/train_dataset.json")
    validation_data = reader.read("./data/sample/train_dataset.json")  # TODO: same data
    return training_data, validation_data
def read_data(
    reader: DatasetReader,
    train_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
    valid_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv",
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(train_data_path)
    validation_data = reader.read(valid_data_path)
    return training_data, validation_data
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read('../data/snips/utterances_train_features.txt')
    validation_data = reader.read('../data/snips/utterances_valid_features.txt')
    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    print("train:", len(training_data), "validation:", len(validation_data))
    return training_data, validation_data
def read_data(
    train_path: str,
    val_path: str,
    train_reader: DatasetReader,
    val_reader: Optional[DatasetReader] = None,
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    print(type(train_reader), train_path)
    training_data = train_reader.read(train_path)
    # Fall back to the training reader when no validation reader is given.
    if val_reader is None:
        validation_data = train_reader.read(val_path)
    else:
        validation_data = val_reader.read(val_path)
    return training_data, validation_data
def create_onepass_generator(iterator: DataIterator, dataset_reader: DatasetReader) -> Iterator:
    generator = iterator(dataset_reader.read("dummy_path"), num_epochs=1, shuffle=False)
    return generator
def run(self, reader: DatasetReader, splits: Dict[str, str]) -> DatasetDict:  # type: ignore
    """
    * `reader` specifies the old-school dataset reader to use.
    * `splits` maps the names of the splits to the filenames to use for
      the dataset reader. It might look like this:
      ```
      {
          "train": "/path/to/train.json",
          "validation": "/path/to/validation.json"
      }
      ```
    """
    instances_map: Dict[str, Sequence[Instance]] = {
        split_name: list(tqdm(reader.read(path), desc=f"Reading {path}"))
        for split_name, path in splits.items()
    }
    vocab = Vocabulary.from_instances(itertools.chain(*instances_map.values()))

    # Index all the instances with the vocab.
    for split_name, instances in instances_map.items():
        for instance in tqdm(instances, desc=f"Indexing {split_name}"):
            instance.index_fields(vocab)

    return DatasetDict(splits=instances_map, vocab=vocab)
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )
    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    # Move the model to the GPU if requested.
    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )
    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()
    return model, metrics
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    with open('News_Category_Dataset_v2.json') as f:
        data = [json.loads(item) for item in f]
    # 60/20/20 split: 20% held out for validation, then 25% of the
    # remaining 80% (i.e. 20% of the total) held out for test.
    train, val = train_test_split(data, test_size=0.2)
    train, test = train_test_split(train, test_size=0.25)
    print(len(train))
    print(len(val))
    print(len(test))
    # This reader accepts a JSON string rather than a file path.
    training_data = reader.read(json.dumps(train))
    validation_data = reader.read(json.dumps(val))
    test_data = reader.read(json.dumps(test))
    return training_data, validation_data, test_data
def get_prediction(model: Model, reader: DatasetReader, data_path: str, batch_size: int = 1024):
    model.eval()
    data = reader.read(data_path)
    predictor = Seq2SeqPredictor(model, reader)
    for ins in batch(data, batch_size):
        yield from predictor.predict_batch_instance(ins)
def read_multi_path_as_multiple_iters(reader: DatasetReader,
                                      multi_path_str: Union[str, Iterator[str]],
                                      file_pattern: str):
    input_paths, path_exists, has_available_path = solve_multi_path(
        multi_path_str, file_pattern=file_pattern)
    if has_available_path:
        return [(reader.read(input_path), input_path)
                for input_path, path_exist in zip(input_paths, path_exists)
                if path_exist]
def read_data(reader: DatasetReader, train_file_name, valid_file_name):
    train_data_instances = reader.read(os.path.join(DATA_PATH, train_file_name))
    valid_data_instances = reader.read(os.path.join(DATA_PATH, valid_file_name))
    return train_data_instances, valid_data_instances
def read_data(reader: DatasetReader, file: str) -> Iterable[Instance]:
    print(f"Reading data from {file}")
    training_data = reader.read(file)
    return training_data
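# A minimal usage sketch, not taken from the snippets above: how a
# read_data helper like these is typically consumed in AllenNLP -- read
# the splits, build a Vocabulary from the training instances, then index
# every instance with it (the same pattern as the run() snippet above).
# The data paths are placeholders, and SequenceTaggingDatasetReader is
# just one concrete DatasetReader (it parses "word###tag" lines) chosen
# here for illustration.
from typing import List, Tuple

from allennlp.data import DatasetReader, Instance, Vocabulary
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader


def demo_read_data(reader: DatasetReader) -> Tuple[List[Instance], List[Instance]]:
    print("Reading data")
    training_data = list(reader.read("data/train.tsv"))    # placeholder path
    validation_data = list(reader.read("data/dev.tsv"))    # placeholder path
    return training_data, validation_data


reader = SequenceTaggingDatasetReader()
train_instances, dev_instances = demo_read_data(reader)

# The vocabulary is built from the training split only, then used to
# index both splits so their token/label ids agree.
vocab = Vocabulary.from_instances(train_instances)
for instance in train_instances + dev_instances:
    instance.index_fields(vocab)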