def test_classification(model, testing_quakes, device, data_dir):
    """
    Test the trained model on the testing set.

    :param model: trained pytorch model
    :param testing_quakes: array of quakes to test
    :param device: pytorch device
    :param data_dir: directory of triggered earthquake data
    :return: accuracy, confusion matrix string, classification model
    """
    ds_train = TriggeredEarthquake(
        data_dir=data_dir,
        testing_quakes=testing_quakes,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        mode=DatasetMode.INFERENCE,
        transform=triggered_earthquake_transform(random_trim_offset=False),
    )
    ds_test = TriggeredEarthquake(
        data_dir=data_dir,
        testing_quakes=testing_quakes,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        mode=DatasetMode.TEST,
        transform=triggered_earthquake_transform(random_trim_offset=False),
    )
    train_loader = DataLoader(ds_train, batch_size=1, num_workers=10, shuffle=True)
    test_loader = DataLoader(ds_test, batch_size=1, num_workers=10, shuffle=True)

    svc = create_classifier(model, train_loader, type='svc', device=device)
    acc, cm = report_accurarcy(model, svc, test_loader, device=device)

    return acc, cm, svc
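# NOTE: create_classifier and report_accurarcy are helpers provided elsewhere in
# seisml. The function below is a *hypothetical* minimal sketch of what an
# SVC-based create_classifier could look like (embed every example with the
# trained network, then fit an sklearn SVC on the embeddings). It assumes one-hot
# labels and is illustrative only, not the repository's actual implementation.
import numpy as np
import torch
from sklearn.svm import SVC


def _create_svc_classifier_sketch(model, loader, device):
    model.eval().to(device)
    embeddings, labels = [], []
    with torch.no_grad():
        for data, label in loader:
            data = data.view(-1, 1, data.shape[-1]).to(device)
            # flatten each embedding into a single feature vector per example
            embeddings.append(model(data).cpu().numpy().reshape(data.shape[0], -1))
            labels.append(np.argmax(label.numpy(), axis=-1))
    clf = SVC()
    clf.fit(np.concatenate(embeddings), np.concatenate(labels))
    return clf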
def test_test_mode(self):
    data_path = os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value)
    accepted_labels = ['positive', 'negative', 'chaos']
    testing_quakes = ['eq01']
    mode = DatasetMode.TEST

    train_ds = TriggeredEarthquake(
        data_dir=data_path,
        force_download=False,
        labels=accepted_labels,
        downloadable_data=DownloadableData.SAMPLE_DATA,
        mode=mode,
        testing_quakes=testing_quakes)

    for file in os.listdir(os.path.join(data_path, 'prepare_{}'.format(mode.value))):
        data = torch.load(os.path.join(data_path, 'prepare_{}'.format(mode.value), file))
        assert data['quake'] in testing_quakes, 'should only contain testing quakes'

    total_raw_count = 0
    for l in accepted_labels:
        for q in testing_quakes:
            count = len(os.listdir(os.path.join(data_path, 'raw', q, l)))
            total_raw_count += count

    ds_count = 0
    for _ in train_ds:
        ds_count += 1

    assert total_raw_count == ds_count, 'dataset should contain all examples'
def test_siamese_learning(self):
    embedding_size = 10

    ds = TriggeredEarthquake(
        data_dir=os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value),
        force_download=False,
        downloadable_data=DownloadableData.SAMPLE_DATA)
    ds = SiameseDataset(ds)
    dl = DataLoader(ds, batch_size=24, num_workers=1, shuffle=True)

    model = DilatedConvolutional(embedding_size=embedding_size, downsample=False)
    test_data, test_label = next(iter(dl))

    params = filter(lambda p: p.requires_grad, model.parameters())
    opt = torch.optim.Adam(params, lr=0.01)
    l = DeepClusteringLoss()

    test_data = test_data.view(-1, 1, test_data.shape[-1])
    embedding_a = model(test_data)
    assert len(embedding_a[-1]) == embedding_size, 'output should match embedding size'
    _loss_a = l(embedding_a, test_label.float())

    for _ in range(4):
        for data, label in dl:
            data = data.view(-1, 1, data.shape[-1])
            output = model(data)
            _loss = l(output, label.float())
            opt.zero_grad()  # clear accumulated gradients before each backward pass
            _loss.backward()
            opt.step()

    embedding_b = model(test_data)
    _loss_b = l(embedding_b, test_label.float())
    assert _loss_b < _loss_a, 'loss on the held-out batch should decrease after training'
def test_download_and_preproces(self):
    ds = TriggeredEarthquake(
        data_dir=os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value),
        force_download=False,
        downloadable_data=DownloadableData.SAMPLE_DATA)

    assert len(ds) > 0, 'files should exist'
    assert os.path.isdir(
        os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value)
    ), 'data should exist'
def test_get_item(self):
    ds = TriggeredEarthquake(
        data_dir=os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value),
        force_download=False,
        downloadable_data=DownloadableData.SAMPLE_DATA)
    dl = DataLoader(ds, batch_size=1, num_workers=1)
    sample = next(iter(dl))

    assert isinstance(sample[0], torch.Tensor), 'data output should be a tensor'
    assert np.sum(sample[1].numpy()) == 1, 'one-hot encoding should contain exactly one class'
def test_paper_configuration(self):
    ds = TriggeredEarthquake(
        data_dir=os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value),
        force_download=False,
        downloadable_data=DownloadableData.SAMPLE_DATA)
    dl = DataLoader(ds, batch_size=1, num_workers=1, shuffle=True)

    model = DilatedConvolutional(embedding_size=10)
    data, label = next(iter(dl))
    summary(model, data.unsqueeze(1))

    num_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
    assert num_params == 7440, "number of params should match the paper's description"
def train_dataset(self):
    transform = triggered_earthquake_transform(
        sampling_rate=20.0,
        max_freq=8.0,
        min_freq=2.0,
        corner=2,
        aug_types=None,
        aug_prob=0.5,
        target_length=8192,
        random_trim_offset=True)

    ds = TriggeredEarthquake(
        data_dir=os.path.expanduser('~/.seisml/data/triggered_earthquakes'),
        force_download=False,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        labels=['positive', 'negative'],
        mode=DatasetMode.TRAIN,
        testing_quakes=['SAC_20021102_XF_prem'],
        transform=transform)
    return ds
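# triggered_earthquake_transform is provided by seisml; the function below is a
# *hypothetical* stand-in showing what the parameters above suggest: a Butterworth
# band-pass between min_freq and max_freq at the given sampling rate, followed by
# trimming or zero-padding to target_length samples. It is illustrative only, not
# the library's actual implementation.
import numpy as np
from scipy.signal import butter, sosfiltfilt


def _transform_sketch(trace, sampling_rate=20.0, min_freq=2.0, max_freq=8.0,
                      corner=2, target_length=8192):
    # band-pass filter the raw trace
    sos = butter(corner, [min_freq, max_freq], btype='bandpass',
                 fs=sampling_rate, output='sos')
    filtered = sosfiltfilt(sos, np.asarray(trace, dtype=np.float64))
    # force a fixed length so every example can be batched
    if len(filtered) < target_length:
        filtered = np.pad(filtered, (0, target_length - len(filtered)))
    return filtered[:target_length].astype(np.float32)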
def test_dataset_wrapping(self):
    ds = TriggeredEarthquake(
        data_dir=os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value),
        force_download=False,
        downloadable_data=DownloadableData.SAMPLE_DATA
    )
    batch_size = 4
    ds = SiameseDataset(ds)
    dl = DataLoader(ds, batch_size=batch_size, num_workers=1)

    data, label = next(iter(dl))
    data_batch = data.shape[0]
    label_batch = label.shape[0]
    data_channel = data.shape[1]
    label_channel = label.shape[1]

    assert data_batch == batch_size, 'data batch size should match'
    assert label_batch == batch_size, 'label batch size should match'
    assert data_channel == label_channel == 2, 'should have two channels'
    assert not np.allclose(data[0][0], data[0][1]), 'paired data points should not be equal'
    assert not np.allclose(label[0][0], label[0][1]), 'paired labels should be different classes'
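# SiameseDataset is imported from seisml; the class below is a *hypothetical*
# minimal pairing wrapper consistent with the assertions above: each item is
# paired with a random item of a different class, so data and labels come back
# with a leading dimension of 2. Illustrative only, not the library's code.
import random
import torch
from torch.utils.data import Dataset


class _SiameseDatasetSketch(Dataset):
    def __init__(self, base):
        self.base = base

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        data_a, label_a = self.base[idx]
        # resample until the partner comes from a different class (assumed pairing rule)
        while True:
            data_b, label_b = self.base[random.randrange(len(self.base))]
            if not torch.equal(label_a, label_b):
                break
        return torch.stack([data_a, data_b]), torch.stack([label_a, label_b])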
def train(prefix, epochs, batch_size, num_workers, embedding_size, num_layers,
          learning_rate, weight_decay, model_dir, run_dir):
    ts = datetime.now().strftime("%m_%d_%Y__%H_%M")
    run_name = '{}_{}'.format(prefix, ts)
    model_dir = os.path.join(model_dir, run_name)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    writer = SummaryWriter(os.path.join(run_dir, run_name))

    ds_train = TriggeredEarthquake(
        mode=DatasetMode.TRAIN,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE)
    ds_test = TriggeredEarthquake(
        mode=DatasetMode.TEST,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        transform=triggered_earthquake_transform(random_trim_offset=False))
    # ds_train = SiameseDataset(ds_train)
    train_loader = DataLoader(ds_train, batch_size=batch_size,
                              num_workers=num_workers, shuffle=True)
    test_loader = DataLoader(ds_test, batch_size=batch_size,
                             num_workers=num_workers, shuffle=True)

    model = DilatedConvolutional(embedding_size=embedding_size, num_layers=num_layers)
    params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
    loss_fn = DeepClusteringLoss()

    trainer = create_engine(model, optimizer, loss_fn, device)
    evaluator = create_eval(model, {'dcl': Loss(loss_fn)}, device)

    summary(model, (1, gin.query_parameter('triggered_earthquake_transform.target_length')))
    writer.add_graph(model, next(iter(train_loader))[0].unsqueeze(1).to(device))

    save_handler = ModelCheckpoint(model_dir, prefix, n_saved=1, create_dir=True,
                                   require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_handler, {'model': model})

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        """
        report training loss per iteration
        :param _:
        :return:
        """
        writer.add_scalar('Iter/train_loss', trainer.state.output, trainer.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(_):
        """
        report training loss per epoch
        :param _:
        :return:
        """
        evaluator.run(train_loader)
        loss = trainer.state.output
        writer.add_scalar('Loss/train', loss, trainer.state.epoch)
        print("Training Results - Epoch: {} Avg loss: {:.2f}".format(
            trainer.state.epoch, trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def test_acc(_):
        """
        report testing accuracy
        :param _:
        :return:
        """
        acc, cm, _ = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        writer.add_scalar('Accuracy/test', acc, trainer.state.epoch)
        print('Testing Accuracy: {:.2f}'.format(acc))
        print(cm)

    def report_embeddings(_):
        """
        write train and test embeddings to tensorboard
        :param _:
        :return:
        """
        train_loader = DataLoader(ds_train, batch_size=1)
        test_loader = DataLoader(ds_test, batch_size=1)
        text_labels = gin.query_parameter('triggered_earthquake_dataset.labels')

        train_embeddings, train_labels = get_embeddings(model, train_loader, device=device)
        train_labels = [text_labels[np.argmax(l)] for l in train_labels.squeeze(1)]
        writer.add_embedding(train_embeddings.squeeze(1),
                             metadata=train_labels,
                             global_step=trainer.state.epoch,
                             tag='train_embeddings')

        test_embeddings, test_labels = get_embeddings(model, test_loader, device=device)
        test_labels = [text_labels[np.argmax(l)] for l in test_labels.squeeze(1)]
        writer.add_embedding(test_embeddings.squeeze(1),
                             metadata=test_labels,
                             global_step=trainer.state.epoch,
                             tag='test_embeddings')

    trainer.add_event_handler(Events.EPOCH_COMPLETED(once=1), report_embeddings)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=5), report_embeddings)

    @trainer.on(Events.COMPLETED)
    def save_classifier(_):
        """
        create and save two svc classifiers in the model_dir:
        - one trained on only the training data
        - one trained on all data
        :param _:
        :return:
        """
        # save classifier only trained on training data
        _, _, classifier = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        with open(os.path.join(model_dir, '{}_classifier.p'.format(prefix)), 'wb') as f:
            pickle.dump(classifier, f)

        # save classifier trained on all data (for running inference)
        ds = TriggeredEarthquake(
            data_dir=gin.query_parameter('triggered_earthquake_dataset.data_dir'),
            testing_quakes=[],
            downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
            mode=DatasetMode.INFERENCE,
            transform=triggered_earthquake_transform(random_trim_offset=False),
        )
        loader = DataLoader(ds, batch_size=1, num_workers=10, shuffle=True)
        classifier_alldata = create_classifier(model, loader, type='svc', device=device)
        with open(os.path.join(model_dir, '{}_svc_classifier.p'.format(prefix)), 'wb') as f:
            pickle.dump(classifier_alldata, f)

    @trainer.on(Events.COMPLETED)
    def save_metadata(_):
        """
        save a metadata file, used for inference
        :param _:
        :return:
        """
        transformer = triggered_earthquake_transform(random_trim_offset=False)
        transformer_path = os.path.join(model_dir, 'transformer.p')
        pickle.dump(transformer, open(transformer_path, 'wb'))

        metadata = {
            'name': run_name,
            'classes': gin.query_parameter('triggered_earthquake_dataset.labels'),
            'model_state_path': save_handler.last_checkpoint,
            'classifier_path': os.path.join(model_dir, '{}_classifier.p'.format(prefix)),
            'embedding_size': embedding_size,
            'num_layers': num_layers,
            'transformer': transformer_path
        }
        with open(os.path.join(model_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f)

    trainer.run(train_loader, max_epochs=epochs)
    writer.close()
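# train() reads several values from gin (data_dir, testing_quakes, labels,
# target_length). The bindings below are a *hypothetical* example of how such a
# config could be supplied before calling train(); the binding names mirror the
# gin.query_parameter calls above, but every concrete value and hyperparameter
# here is illustrative only.
_example_gin_bindings = """
triggered_earthquake_dataset.data_dir = '~/.seisml/data/triggered_earthquakes'
triggered_earthquake_dataset.testing_quakes = ['SAC_20021102_XF_prem']
triggered_earthquake_dataset.labels = ['positive', 'negative']
triggered_earthquake_transform.target_length = 8192
"""
# gin.parse_config(_example_gin_bindings)
# train(prefix='tqe', epochs=100, batch_size=24, num_workers=4, embedding_size=10,
#       num_layers=5, learning_rate=1e-3, weight_decay=1e-5,
#       model_dir='models', run_dir='runs')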
def inference(experiment_path, earthquake_path, labels=None):
    ts = datetime.now().strftime("%m_%d_%Y__%H_%M")
    result_name = '{}_{}'.format(earthquake_path.split('/')[-1], ts)

    # load metadata
    with open(os.path.join(experiment_path, 'metadata.json'), 'r') as f:
        metadata = json.load(f)

    model_path = metadata['model_state_path']
    classifier_path = metadata['classifier_path']
    embedding_size = metadata['embedding_size']
    num_layers = metadata['num_layers']
    transformer_path = metadata['transformer']

    # load the model
    state = torch.load(model_path)
    model = DilatedConvolutional(embedding_size=embedding_size, num_layers=num_layers)
    model.load_state_dict(state)
    model.eval()  # inference mode

    # load the classifier
    classifier = pickle.load(open(classifier_path, 'rb'))

    # run through each example in the earthquake path
    transformer = pickle.load(open(transformer_path, 'rb'))
    dataset = TriggeredEarthquake(
        data_dir=earthquake_path,
        downloadable_data=None,
        mode=DatasetMode.INFERENCE,
        testing_quakes=[],
        labels=labels,
        transform=transformer)

    result_csv_path = os.path.join(experiment_path, '{}_results.csv'.format(result_name))
    headers = ['quake', 'name', 'given_label', 'classification']
    writer = csv.DictWriter(open(result_csv_path, 'w'), fieldnames=headers)
    writer.writeheader()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    embeddings = []
    for obs in dataset.processed_files:
        processed = torch.load(obs)
        data = processed['data']
        label = processed['label']
        quake = processed['quake']
        file_name = processed['file_name']

        embedding = model(data.view(-1, 1, data.shape[-1]).to(device)).detach().cpu().numpy()
        embeddings.append(embedding)

        classification = labels[np.argmax(classifier.predict(embedding))]
        writer.writerow({
            'quake': quake,
            'name': file_name,
            'given_label': label,
            'classification': classification
        })
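# A *hypothetical* invocation of inference(): experiment_path is assumed to point
# at a run directory containing the metadata.json written by train() above, and
# earthquake_path at a directory of new observations laid out like the training
# data. The concrete paths and label names below are illustrative only.
if __name__ == '__main__':
    inference(
        experiment_path='models/tqe_01_01_2020__00_00',
        earthquake_path=os.path.expanduser('~/.seisml/data/new_quake'),
        labels=['positive', 'negative'],
    )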