Example #1
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("data/train.tsv")
    validation_data = reader.read("data/dev.tsv")
    return training_data, validation_data
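A minimal sketch of how such a helper is typically called; ClassificationTsvReader is a hypothetical DatasetReader subclass standing in for whatever reader the surrounding project defines:

reader = ClassificationTsvReader()
training_data, validation_data = read_data(reader)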
Example #2
def read_data(
    reader: DatasetReader,
    train_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
    valid_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv",
) -> Tuple[Iterable[Instance], Iterable[Instance]]:

    logger.critical("Reading the data. Lazy variable set to {}".format(
        reader.lazy))
    start_time = time.time()

    # Expect: this is the only time it is called
    reader.mode = "train"
    training_data = reader.read(train_data_path)

    # instead, we will set the examples differently here
    reader.mode = "valid"
    validation_data = reader.read(valid_data_path)  # need to unlimit the examples here...

    logger.critical("Finished the call to read the data. Time taken: {}".format(
        time.time() - start_time))

    return training_data, validation_data
Example #3
def read_data(
    reader: DatasetReader, tgt_domain: str, input_path: str, domains: List[str]
) -> Tuple[Iterable[Instance], Iterable[Instance], Iterable[Instance]]:
    print("Reading data")

    # Train on every domain except the target domain.
    training_data = None
    for domain in domains:
        if domain != tgt_domain:
            if training_data is None:
                training_data = reader.read(input_path + domain + '/' +
                                            domain + '_neg.txt')
            else:
                training_data += reader.read(input_path + domain + '/' +
                                             domain + '_neg.txt')

    # The target domain is held out and split into validation and test sets.
    valid_test_data = reader.read(input_path + tgt_domain + '/' + tgt_domain +
                                  '_neg.txt')

    # Use 25% of the held-out data for validation, capped at 2000 instances.
    as_per_percent = int(len(valid_test_data) * 0.25)
    valid_size = min(2000, as_per_percent)

    validation_data = valid_test_data[:valid_size]
    test_data = valid_test_data[valid_size:]

    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    test_data = AllennlpDataset(test_data)

    print("train:", len(training_data), "validation:", len(validation_data),
          "test:", len(test_data))
    return training_data, validation_data, test_data
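A hypothetical invocation of this leave-one-out helper; the domain names and base path below are made up for illustration:

domains = ["books", "dvd", "electronics", "kitchen"]
train, valid, test = read_data(reader, tgt_domain="books",
                               input_path="data/", domains=domains)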
Example #4
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("quick_start/data/movie_review/train.tsv")
    validation_data = reader.read("quick_start/data/movie_review/dev.tsv")
    return training_data, validation_data
Example #5
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(TRAIN_PATH)
    validation_data = reader.read(DEV_PATH)
    return training_data, validation_data
Example #6
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("/path/to/your/training/data")
    validation_data = reader.read("/path/to/your/validation/data")
    return training_data, validation_data
Example #7
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read("./data/sample/train_dataset.json")
    validation_data = reader.read(
        "./data/sample/train_dataset.json")  # TODO: same data
    return training_data, validation_data
Example #8
def read_data(
    reader: DatasetReader,
    train_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/train/listfile.csv",
    valid_data_path: str = "/scratch/gobi1/johnchen/new_git_stuff/multimodal_fairness/data/in-hospital-mortality/test/listfile.csv",
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(train_data_path)
    validation_data = reader.read(valid_data_path)
    return training_data, validation_data
Example #9
def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    
    training_data = reader.read('../data/snips/utterances_train_features.txt')
    validation_data = reader.read('../data/snips/utterances_valid_features.txt')
    
    training_data = AllennlpDataset(training_data)
    validation_data = AllennlpDataset(validation_data)
    
    print("train:",len(training_data), "validation:", len(validation_data))
    return training_data, validation_data
Example #10
def read_data(
    train_path: str,
    val_path: str,
    train_reader: DatasetReader,
    val_reader: Optional[DatasetReader] = None
) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    print(type(train_reader), train_path)
    training_data = train_reader.read(train_path)
    if val_reader is None:
        validation_data = train_reader.read(val_path)
    else:
        validation_data = val_reader.read(val_path)
    return training_data, validation_data
Example #11
def create_onepass_generator(iterator: DataIterator,
                             dataset_reader: DatasetReader) -> Iterator:
    generator = iterator(dataset_reader.read("dummy_path"),
                         num_epochs=1,
                         shuffle=False)

    return generator
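A minimal sketch of consuming the one-pass generator; construction of the iterator and dataset_reader is assumed to happen elsewhere, and run_inference is a hypothetical per-batch handler:

for tensor_batch in create_onepass_generator(iterator, dataset_reader):
    run_inference(tensor_batch)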
Example #12
    def run(self, reader: DatasetReader,
            splits: Dict[str, str]) -> DatasetDict:  # type: ignore
        """
        * `reader` specifies the old-school dataset reader to use.
        * `splits` maps the names of the splits to the filenames to use for the
          dataset reader. It might look like this:
          ```
          {
              "train": "/path/to/train.json",
              "validation": "/path/to/validation.json"
          }
          ```
        """
        instances_map: Dict[str, Sequence[Instance]] = {
            split_name: list(tqdm(reader.read(path), desc=f"Reading {path}"))
            for split_name, path in splits.items()
        }
        vocab = Vocabulary.from_instances(
            itertools.chain(*instances_map.values()))

        # index all the instances with the vocab
        for split_name, instances in instances_map.items():
            for instance in tqdm(instances, desc=f"Indexing {split_name}"):
                instance.index_fields(vocab)

        return DatasetDict(splits=instances_map, vocab=vocab)
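A hypothetical call to this step, following the splits layout described in the docstring; step and my_reader stand in for objects built elsewhere:

splits = {
    "train": "/path/to/train.json",
    "validation": "/path/to/validation.json",
}
dataset_dict = step.run(reader=my_reader, splits=splits)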
Example #13
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    # Both readers appear to wrap in-memory datasets, so the 'tmp.txt' path
    # argument looks like a placeholder rather than a real file.
    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics
Example #14
def read_data(
        reader: DatasetReader
) -> Tuple[Iterable[Instance], Iterable[Instance], Iterable[Instance]]:
    print("Reading data")

    with open('News_Category_Dataset_v2.json') as f:
        data = [json.loads(item) for item in f]
    # 80/20 split first, then 75/25 of the remainder:
    # 60% train, 20% validation, 20% test overall.
    train, val = train_test_split(data, test_size=0.2)
    train, test = train_test_split(train, test_size=0.25)
    print(len(train))
    print(len(val))
    print(len(test))

    training_data = reader.read(json.dumps(train))
    validation_data = reader.read(json.dumps(val))
    test_data = reader.read(json.dumps(test))
    return training_data, validation_data, test_data
Example #15
def get_prediction(model: Model,
                   reader: DatasetReader,
                   data_path: str,
                   batch_size: int = 1024):
    model.eval()
    data = reader.read(data_path)
    predictor = Seq2SeqPredictor(model, reader)

    # `batch` (a helper defined elsewhere in the source) chunks the instances
    # into groups of up to batch_size for batched prediction.
    for ins in batch(data, batch_size):
        yield from predictor.predict_batch_instance(ins)
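A minimal sketch of draining the resulting prediction generator; the data path here is hypothetical:

for prediction in get_prediction(model, reader, "data/test_set.tsv"):
    print(prediction)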
Example #16
def read_multi_path_as_multiple_iters(reader: DatasetReader,
                                      multi_path_str: Union[str, Iterator[str]],
                                      file_pattern: str):
    input_paths, path_exists, has_available_path = solve_multi_path(
        multi_path_str, file_pattern=file_pattern)
    if has_available_path:
        return [(reader.read(input_path), input_path)
                for input_path, path_exist in zip(input_paths, path_exists)
                if path_exist]
    # implicitly returns None when no available path was found
Example #17
def read_data(reader: DatasetReader, train_file_name, valid_file_name):
    train_data_instances = reader.read(os.path.join(DATA_PATH, train_file_name))
    valid_data_instances = reader.read(os.path.join(DATA_PATH, valid_file_name))
    return train_data_instances, valid_data_instances
Example #18
def read_data(reader: DatasetReader, file: str) -> Iterable[Instance]:
    print(f"Reading data from {file}")
    training_data = reader.read(file)
    return training_data