def test_label_studio_predictions_visualization():
    """Test creation of LabelStudioImageClassificationInput and Datamodule from images."""
    download_data("https://label-studio-testdata.s3.us-east-2.amazonaws.com/lightning-flash/data.zip")
    dm = ImageClassificationData.from_labelstudio(
        train_export_json="data/project.json",
        train_data_folder="data/upload/",
        test_export_json="data/project.json",
        test_data_folder="data/upload/",
        val_split=0.5,
        batch_size=1,
    )
    assert dm

    app = launch_app(dm)
    raw_predictions = [[0, 1], [1, 0]]

    # Visualized predictions: four annotated tasks with distinct result ids.
    visualized = app.show_predictions(raw_predictions)
    assert len(visualized) == 4
    assert visualized[0]["result"][0]["id"] != visualized[3]["result"][0]["id"]
    assert visualized[1]["result"][0]["id"] != visualized[2]["result"][0]["id"]

    # Task view of the same predictions, with and without an export file.
    tasks = app.show_tasks(raw_predictions)
    assert len(tasks) == 4
    tasks_json = app.show_tasks(raw_predictions, export_json="data/project.json")
    assert tasks_json
def test_input_labelstudio_text():
    """Test creation of LabelStudioTextClassificationInput and Datamodule from text."""
    download_data(
        "https://label-studio-testdata.s3.us-east-2.amazonaws.com/lightning-flash/text_data.zip",
        "./data/",
    )
    export_config = {
        "data_folder": "data/upload/",
        "export_json": "data/project.json",
        "multi_label": False,
    }
    # Carve the export into train/test, then train/val splits.
    remaining, test_split = LabelStudioInput._split_train_test_data(export_config)
    train_split, val_split = LabelStudioInput._split_train_val_data(remaining, split=0.2)

    train_input = LabelStudioTextClassificationInput(RunningStage.TRAINING, train_split)
    val_input = LabelStudioTextClassificationInput(
        RunningStage.VALIDATING, val_split, parameters=train_input.parameters
    )
    test_input = LabelStudioTextClassificationInput(
        RunningStage.TESTING, test_split, parameters=train_input.parameters
    )

    assert train_input[0]
    assert val_input[0]
    assert len(test_input) == 0
def test_pointcloud_object_detection_data(tmpdir):
    """End-to-end smoke test for ``PointCloudObjectDetectorData``: download a micro KITTI set,
    fit a single training batch while asserting the collated batch layout, then predict on one
    scan and check the output shapes.
    """
    seed_everything(52)  # deterministic sampling so the exact assertions below are stable
    download_data("https://pl-flash-data.s3.amazonaws.com/KITTI_micro.zip", tmpdir)
    dm = PointCloudObjectDetectorData.from_folders(
        train_folder=join(tmpdir, "KITTI_Micro", "Kitti", "train"))

    class MockModel(PointCloudObjectDetector):
        # Overrides training_step purely to assert the structure of the collated batch.
        def training_step(self, batch, batch_idx: int):
            # The collator should hand over its own batch type, not a plain list/dict.
            assert isinstance(batch, ObjectDetectBatchCollator)
            assert len(batch.point) == 2  # two scans collated — presumably the default batch size; TODO confirm
            assert batch.point[0][1].shape == torch.Size([4])
            assert len(batch.bboxes) > 1
            # The micro dataset contains exactly these two scan files.
            assert batch.attr[0]["name"] in ("000000.bin", "000001.bin")
            assert batch.attr[1]["name"] in ("000000.bin", "000001.bin")

    num_classes = 19
    model = MockModel(backbone="pointpillars_kitti", num_classes=num_classes)
    trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=0)
    trainer.fit(model, dm)

    predict_path = join(tmpdir, "KITTI_Micro", "Kitti", "predict")
    model.eval()  # switch out of training mode before running prediction
    predictions = model.predict([join(predict_path, "scans/000000.bin")])
    # Each input point carries 4 values; this scan yields 158 predictions.
    assert predictions[0][DefaultDataKeys.INPUT].shape[1] == 4
    assert len(predictions[0][DefaultDataKeys.PREDS]) == 158
def test_datamodule_labelstudio_text():
    """Test creation of LabelStudioTextClassificationInput and Datamodule from text."""
    archive_url = "https://label-studio-testdata.s3.us-east-2.amazonaws.com/lightning-flash/text_data.zip"
    download_data(archive_url, "./data/")
    dm = TextClassificationData.from_labelstudio(
        batch_size=4,
        train_export_json="data/project.json",
        data_folder="data/upload/",
    )
    assert dm
def from_urban8k(
    batch_size: int = 4,
    **data_module_kwargs,
) -> AudioClassificationData:
    """Download the Urban 8k spectrogram-image archive and build an ``AudioClassificationData``.

    Args:
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``AudioClassificationData.from_folders``.
    """
    archive_url = "https://pl-flash-data.s3.amazonaws.com/urban8k_images.zip"
    download_data(archive_url, "./data")
    return AudioClassificationData.from_folders(
        batch_size=batch_size,
        train_folder="data/urban8k_images/train",
        val_folder="data/urban8k_images/val",
        **data_module_kwargs,
    )
def from_kitti(
    batch_size: int = 4,
    **data_module_kwargs,
) -> PointCloudObjectDetectorData:
    """Download the tiny KITTI archive and build a ``PointCloudObjectDetectorData`` module.

    Args:
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``PointCloudObjectDetectorData.from_folders``.
    """
    archive_url = "https://pl-flash-data.s3.amazonaws.com/KITTI_tiny.zip"
    download_data(archive_url, "data/")
    return PointCloudObjectDetectorData.from_folders(
        batch_size=batch_size,
        train_folder="data/KITTI_Tiny/Kitti/train",
        val_folder="data/KITTI_Tiny/Kitti/val",
        **data_module_kwargs,
    )
def from_kitti(
    batch_size: int = 4,
    **data_module_kwargs,
) -> PointCloudSegmentationData:
    """Download the tiny SemanticKITTI archive and build a ``PointCloudSegmentationData`` module.

    Args:
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``PointCloudSegmentationData.from_folders``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/SemanticKittiTiny.zip", "data/")
    return PointCloudSegmentationData.from_folders(
        batch_size=batch_size,
        train_folder="data/SemanticKittiTiny/train",
        val_folder="data/SemanticKittiTiny/val",
        **data_module_kwargs,
    )
def test_datamodule_labelstudio_video():
    """Test creation of Datamodule from video."""
    archive_url = "https://label-studio-testdata.s3.us-east-2.amazonaws.com/lightning-flash/video_data.zip"
    download_data(archive_url)
    dm = VideoClassificationData.from_labelstudio(
        export_json="data/project.json",
        data_folder="data/upload/",
        decode_audio=False,
        clip_sampler="uniform",
        clip_duration=1,
        batch_size=1,
    )
    assert dm
def from_squad(
    batch_size: int = 4,
    **data_module_kwargs,
) -> QuestionAnsweringData:
    """Download a tiny SQuAD v2 subset and build a ``QuestionAnsweringData`` module.

    Args:
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``QuestionAnsweringData.from_squad_v2``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/squad_tiny.zip", "./data/")
    return QuestionAnsweringData.from_squad_v2(
        batch_size=batch_size,
        train_file="./data/squad_tiny/train.json",
        val_file="./data/squad_tiny/val.json",
        **data_module_kwargs,
    )
def from_imdb(
    batch_size: int = 4,
    **data_module_kwargs,
) -> TextClassificationData:
    """Download the IMDB reviews archive and build a sentiment ``TextClassificationData`` module.

    Args:
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``TextClassificationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/imdb.zip", "./data/")
    return TextClassificationData.from_csv(
        "review",
        "sentiment",
        batch_size=batch_size,
        train_file="data/imdb/train.csv",
        val_file="data/imdb/valid.csv",
        **data_module_kwargs,
    )
def from_hymenoptera(
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> ImageClassificationData:
    """Download the Hymenoptera (ants vs. bees) images and build an ``ImageClassificationData``.

    Args:
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``ImageClassificationData.from_folders``.
    """
    archive_url = "https://pl-flash-data.s3.amazonaws.com/hymenoptera_data.zip"
    download_data(archive_url, "./data")
    return ImageClassificationData.from_folders(
        batch_size=batch_size,
        num_workers=num_workers,
        train_folder="data/hymenoptera_data/train/",
        val_folder="data/hymenoptera_data/val/",
        **preprocess_kwargs,
    )
def test_datamodule_labelstudio_image():
    """Test creation of LabelStudioImageClassificationInput and Datamodule from images."""
    archive_url = "https://label-studio-testdata.s3.us-east-2.amazonaws.com/lightning-flash/data.zip"
    download_data(archive_url)
    dm = ImageClassificationData.from_labelstudio(
        val_split=0.5,
        batch_size=1,
        train_export_json="data/project.json",
        train_data_folder="data/upload/",
        test_export_json="data/project.json",
        test_data_folder="data/upload/",
    )
    assert dm
def from_toxic(
    val_split: float = 0.1,
    batch_size: int = 4,
    **data_module_kwargs,
) -> TextClassificationData:
    """Download the Jigsaw toxic-comments archive and build a multi-label ``TextClassificationData``.

    Args:
        val_split: Fraction of the training data held out for validation.
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``TextClassificationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/jigsaw_toxic_comments.zip", "./data")
    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    return TextClassificationData.from_csv(
        "comment_text",
        label_columns,
        train_file="data/jigsaw_toxic_comments/train.csv",
        batch_size=batch_size,
        val_split=val_split,
        **data_module_kwargs,
    )
def from_coco_128(
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> StyleTransferData:
    """Download the COCO 128 image subset and build a ``StyleTransferData`` module.

    Args:
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``StyleTransferData.from_folders``.
    """
    coco_url = "https://github.com/zhiqwang/yolov5-rt-stack/releases/download/v0.3.0/coco128.zip"
    download_data(coco_url, "data/")
    return StyleTransferData.from_folders(
        train_folder="data/coco128/images/train2017/",
        num_workers=num_workers,
        batch_size=batch_size,
        **preprocess_kwargs,
    )
def titanic_data_download(path: str, predict_size: float = 0.1) -> None:
    """Download the Titanic CSV into ``path`` and split off a label-free ``predict.csv``.

    After this runs, ``path`` contains ``titanic.csv`` (training rows) and
    ``predict.csv`` (rows with the ``Survived`` column dropped). The split is
    skipped when both files already exist from a previous run.

    Args:
        path: Directory that will hold the two CSV files (created if missing).
        predict_size: Fraction of rows reserved for prediction; must lie strictly in (0, 1).

    Raises:
        ValueError: If the split is performed and ``predict_size`` is not in (0, 1).
    """
    # exist_ok avoids the exists()/makedirs() check-then-act race of the LBYL form.
    os.makedirs(path, exist_ok=True)
    path_data = os.path.join(path, "titanic.csv")
    download_data("https://pl-flash-data.s3.amazonaws.com/titanic.csv", path_data)
    # Only split once: skip when both files are already present.
    if set(os.listdir(path)) != {"predict.csv", "titanic.csv"}:
        # Raise instead of assert: asserts are stripped under `python -O`.
        if not 0 < predict_size < 1:
            raise ValueError(f"`predict_size` must be strictly between 0 and 1, got {predict_size}.")
        df = pd.read_csv(path_data)
        df_train, df_predict = train_test_split(df, test_size=predict_size)
        df_train.to_csv(path_data)
        # The predict split must not leak the target column.
        df_predict = df_predict.drop(columns=["Survived"])
        df_predict.to_csv(os.path.join(path, "predict.csv"))
def from_xsum(
    batch_size: int = 4,
    num_workers: int = 0,
    **input_transform_kwargs,
) -> SummarizationData:
    """Download the XSum archive and build a ``SummarizationData`` module from its CSV splits.

    Args:
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        input_transform_kwargs: Forwarded to ``SummarizationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/xsum.zip", "./data/")
    return SummarizationData.from_csv(
        "input",
        "target",
        batch_size=batch_size,
        num_workers=num_workers,
        train_file="data/xsum/train.csv",
        val_file="data/xsum/valid.csv",
        **input_transform_kwargs,
    )
def from_movie_posters(
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> ImageClassificationData:
    """Download the movie-posters archive and build a multi-label genre ``ImageClassificationData``.

    Args:
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``ImageClassificationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/movie_posters.zip", "./data")
    genre_columns = ["Action", "Romance", "Crime", "Thriller", "Adventure"]
    return ImageClassificationData.from_csv(
        "Id",
        genre_columns,
        train_file="data/movie_posters/train/metadata.csv",
        val_file="data/movie_posters/val/metadata.csv",
        num_workers=num_workers,
        batch_size=batch_size,
        **preprocess_kwargs,
    )
def from_coco_128(
    val_split: float = 0.1,
    transform_kwargs: Optional[Dict[str, Any]] = None,
    batch_size: int = 1,
    **data_module_kwargs,
) -> ObjectDetectionData:
    """Download the COCO 128 subset and build an ``ObjectDetectionData`` module.

    Args:
        val_split: Fraction of the training data held out for validation.
        transform_kwargs: Transform options; defaults to ``{"image_size": (128, 128)}``.
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``ObjectDetectionData.from_coco``.
    """
    download_data("https://github.com/zhiqwang/yolov5-rt-stack/releases/download/v0.3.0/coco128.zip", "data/")
    if transform_kwargs is None:
        transform_kwargs = {"image_size": (128, 128)}
    return ObjectDetectionData.from_coco(
        train_folder="data/coco128/images/train2017/",
        train_ann_file="data/coco128/annotations/instances_train2017.json",
        transform_kwargs=transform_kwargs,
        val_split=val_split,
        batch_size=batch_size,
        **data_module_kwargs,
    )
def from_titanic(
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> TabularClassificationData:
    """Download the Titanic archive and build a survival ``TabularClassificationData`` module.

    Args:
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``TabularClassificationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/titanic.zip", "./data")
    categorical_fields = ["Sex", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"]
    return TabularClassificationData.from_csv(
        categorical_fields,
        "Fare",
        target_fields="Survived",
        train_file="data/titanic/titanic.csv",
        num_workers=num_workers,
        batch_size=batch_size,
        val_split=0.1,
        **preprocess_kwargs,
    )
def from_wmt_en_ro(
    batch_size: int = 4,
    num_workers: int = 0,
    **input_transform_kwargs,
) -> TranslationData:
    """Download the WMT English-Romanian archive and build a ``TranslationData`` module.

    Args:
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        input_transform_kwargs: Forwarded to ``TranslationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/wmt_en_ro.zip", "./data")
    return TranslationData.from_csv(
        "input",
        "target",
        batch_size=batch_size,
        num_workers=num_workers,
        train_file="data/wmt_en_ro/train.csv",
        val_file="data/wmt_en_ro/valid.csv",
        **input_transform_kwargs,
    )
def from_titanic(
    val_split: float = 0.1,
    batch_size: int = 4,
    **data_module_kwargs,
) -> TabularRegressionData:
    """Download the Titanic archive and build a fare-regression ``TabularRegressionData`` module.

    Args:
        val_split: Fraction of the training data held out for validation.
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``TabularRegressionData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/titanic.zip", "./data")
    categorical_fields = ["Sex", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"]
    return TabularRegressionData.from_csv(
        categorical_fields,
        None,
        target_field="Fare",
        train_file="data/titanic/titanic.csv",
        batch_size=batch_size,
        val_split=val_split,
        **data_module_kwargs,
    )
def from_kinetics(
    clip_sampler: str = "uniform",
    clip_duration: int = 1,
    decode_audio: bool = False,
    batch_size=1,
    **data_module_kwargs,
) -> VideoClassificationData:
    """Download the Kinetics sample archive and build a ``VideoClassificationData`` module.

    Args:
        clip_sampler: Strategy used to sample clips from each video.
        clip_duration: Clip length passed to the sampler.
        decode_audio: Whether to decode the audio track as well.
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``VideoClassificationData.from_folders``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/kinetics.zip", "./data")
    cwd = os.getcwd()
    return VideoClassificationData.from_folders(
        train_folder=os.path.join(cwd, "data/kinetics/train"),
        val_folder=os.path.join(cwd, "data/kinetics/val"),
        decode_audio=decode_audio,
        clip_sampler=clip_sampler,
        clip_duration=clip_duration,
        batch_size=batch_size,
        **data_module_kwargs,
    )
def from_xsum(
    backbone: str = "sshleifer/distilbart-xsum-1-1",
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> SummarizationData:
    """Download the XSum archive and build a ``SummarizationData`` module for the given backbone.

    Args:
        backbone: Pretrained model identifier used by the data pipeline.
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``SummarizationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/xsum.zip", "./data/")
    return SummarizationData.from_csv(
        "input",
        "target",
        backbone=backbone,
        num_workers=num_workers,
        batch_size=batch_size,
        train_file="data/xsum/train.csv",
        val_file="data/xsum/valid.csv",
        **preprocess_kwargs,
    )
def from_imdb(
    backbone: str = "prajjwal1/bert-medium",
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> TextClassificationData:
    """Download the IMDB reviews archive and build a sentiment ``TextClassificationData`` module.

    Args:
        backbone: Pretrained model identifier used by the data pipeline.
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``TextClassificationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/imdb.zip", "./data/")
    return TextClassificationData.from_csv(
        "review",
        "sentiment",
        backbone=backbone,
        num_workers=num_workers,
        batch_size=batch_size,
        train_file="data/imdb/train.csv",
        val_file="data/imdb/valid.csv",
        **preprocess_kwargs,
    )
def from_coco_128(
    val_split: float = 0.1,
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> ObjectDetectionData:
    """Download the COCO 128 subset and build an ``ObjectDetectionData`` module.

    Args:
        val_split: Fraction of the training data held out for validation.
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``ObjectDetectionData.from_coco``.
    """
    coco_url = "https://github.com/zhiqwang/yolov5-rt-stack/releases/download/v0.3.0/coco128.zip"
    download_data(coco_url, "data/")
    return ObjectDetectionData.from_coco(
        train_folder="data/coco128/images/train2017/",
        train_ann_file="data/coco128/annotations/instances_train2017.json",
        num_workers=num_workers,
        batch_size=batch_size,
        val_split=val_split,
        **preprocess_kwargs,
    )
def from_toxic(
    backbone: str = "unitary/toxic-bert",
    val_split: float = 0.1,
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> TextClassificationData:
    """Download the Jigsaw toxic-comments archive and build a multi-label ``TextClassificationData``.

    Args:
        backbone: Pretrained model identifier used by the data pipeline.
        val_split: Fraction of the training data held out for validation.
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``TextClassificationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/jigsaw_toxic_comments.zip", "./data")
    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    return TextClassificationData.from_csv(
        "comment_text",
        label_columns,
        train_file="data/jigsaw_toxic_comments/train.csv",
        backbone=backbone,
        num_workers=num_workers,
        batch_size=batch_size,
        val_split=val_split,
        **preprocess_kwargs,
    )
def from_carla(
    num_classes: int = 21,
    val_split: float = 0.1,
    batch_size: int = 4,
    **data_module_kwargs,
) -> SemanticSegmentationData:
    """Download a CARLA simulator capture and build a ``SemanticSegmentationData`` module.

    Args:
        num_classes: Number of segmentation classes in the capture.
        val_split: Fraction of the training data held out for validation.
        batch_size: Number of samples per batch.
        data_module_kwargs: Forwarded to ``SemanticSegmentationData.from_folders``.
    """
    capture_url = (
        "https://github.com/ongchinkiat/LyftPerceptionChallenge/releases/download/v0.1/carla-capture-20180513A.zip"
    )
    download_data(capture_url, "./data")
    return SemanticSegmentationData.from_folders(
        train_folder="data/CameraRGB",
        train_target_folder="data/CameraSeg",
        num_classes=num_classes,
        batch_size=batch_size,
        val_split=val_split,
        **data_module_kwargs,
    )
def from_timit(
    val_split: float = 0.1,
    batch_size: int = 4,
    num_workers: int = 0,
    **input_transform_kwargs,
) -> SpeechRecognitionData:
    """Download the TIMIT sample archive and build a ``SpeechRecognitionData`` module.

    Args:
        val_split: Fraction of the training data held out for validation.
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        input_transform_kwargs: Forwarded to ``SpeechRecognitionData.from_json``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/timit_data.zip", "./data")
    return SpeechRecognitionData.from_json(
        "file",
        "text",
        train_file="data/timit/train.json",
        test_file="data/timit/test.json",
        num_workers=num_workers,
        batch_size=batch_size,
        val_split=val_split,
        **input_transform_kwargs,
    )
def from_wmt_en_ro(
    backbone: str = "Helsinki-NLP/opus-mt-en-ro",
    batch_size: int = 4,
    num_workers: int = 0,
    **preprocess_kwargs,
) -> TranslationData:
    """Download the WMT English-Romanian archive and build a ``TranslationData`` module.

    Args:
        backbone: Pretrained model identifier used by the data pipeline.
        batch_size: Number of samples per batch.
        num_workers: DataLoader worker processes.
        preprocess_kwargs: Forwarded to ``TranslationData.from_csv``.
    """
    download_data("https://pl-flash-data.s3.amazonaws.com/wmt_en_ro.zip", "./data")
    return TranslationData.from_csv(
        "input",
        "target",
        backbone=backbone,
        num_workers=num_workers,
        batch_size=batch_size,
        train_file="data/wmt_en_ro/train.csv",
        val_file="data/wmt_en_ro/valid.csv",
        **preprocess_kwargs,
    )
def test_pointcloud_segmentation_data(tmpdir):
    """End-to-end smoke test for ``PointCloudSegmentationData``: download a micro SemanticKITTI
    set, fit a single training batch while asserting the batch layout, then predict and check
    the output shapes.
    """
    seed_everything(52)  # deterministic sampling so the exact assertions below are stable
    download_data(
        "https://pl-flash-data.s3.amazonaws.com/SemanticKittiMicro.zip", tmpdir)
    datamodule = PointCloudSegmentationData.from_folders(
        train_folder=join(tmpdir, "SemanticKittiMicro", "train"),
        predict_folder=join(tmpdir, "SemanticKittiMicro", "predict"),
        batch_size=4,
    )

    class MockModel(PointCloudSegmentation):
        # Overrides training_step purely to assert the structure of the collated batch.
        def training_step(self, batch, batch_idx: int):
            # Multi-scale point pyramid: each level downsamples ~4x (45056 -> 11264 -> 2816 -> 704).
            # NOTE(review): leading dim is 2 despite batch_size=4 — presumably only two train
            # scans exist in the micro set; TODO confirm.
            assert batch[DataKeys.INPUT]["xyz"][0].shape == torch.Size(
                [2, 45056, 3])
            assert batch[DataKeys.INPUT]["xyz"][1].shape == torch.Size(
                [2, 11264, 3])
            assert batch[DataKeys.INPUT]["xyz"][2].shape == torch.Size(
                [2, 2816, 3])
            assert batch[DataKeys.INPUT]["xyz"][3].shape == torch.Size(
                [2, 704, 3])
            # One label per point at full resolution, covering class ids 0..19.
            assert batch[DataKeys.INPUT]["labels"].shape == torch.Size(
                [2, 45056])
            assert batch[DataKeys.INPUT]["labels"].max() == 19
            assert batch[DataKeys.INPUT]["labels"].min() == 0
            # The micro dataset contains exactly these two scans.
            assert batch[DataKeys.METADATA][0]["name"] in ("00_000000",
                                                           "00_000001")
            assert batch[DataKeys.METADATA][1]["name"] in ("00_000000",
                                                           "00_000001")

    num_classes = 19
    model = MockModel(backbone="randlanet", num_classes=num_classes)
    trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=0)
    trainer.fit(model, datamodule=datamodule)

    predictions = trainer.predict(model, datamodule=datamodule)[0]
    # Per-point xyz input, per-point class scores, and per-point targets.
    assert predictions[0][DataKeys.INPUT].shape == torch.Size([45056, 3])
    assert predictions[0][DataKeys.PREDS].shape == torch.Size([45056, 19])
    assert predictions[0][DataKeys.TARGET].shape == torch.Size([45056])