Example 1
def test_classification(model, testing_quakes, device, data_dir):
    """
    test model on testing set
    :param model: trained pytorch model
    :param testing_quakes: Array of quakes to test
    :param device: pytorch device
    :param data_dir: directory of triggered earthquake data
    :return: accuracy, confusion matrix string, fitted classifier
    """
    ds_train = TriggeredEarthquake(
        data_dir=data_dir,
        testing_quakes=testing_quakes,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        mode=DatasetMode.INFERENCE,
        transform=triggered_earthquake_transform(random_trim_offset=False),
    )
    ds_test = TriggeredEarthquake(
        data_dir=data_dir,
        testing_quakes=testing_quakes,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        mode=DatasetMode.TEST,
        transform=triggered_earthquake_transform(random_trim_offset=False))
    train_loader = DataLoader(ds_train,
                              batch_size=1,
                              num_workers=10,
                              shuffle=True)
    test_loader = DataLoader(ds_test,
                             batch_size=1,
                             num_workers=10,
                             shuffle=True)

    svc = create_classifier(model, train_loader, type='svc', device=device)
    acc, cm = report_accurarcy(model, svc, test_loader, device=device)

    return acc, cm, svc
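
For orientation, here is a minimal sketch of how test_classification might be called once a model has been trained. The checkpoint file, quake name, and data directory are illustrative placeholders, not values from the original project:

import os
import torch

# Hypothetical usage of test_classification; paths and quake names are placeholders.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DilatedConvolutional(embedding_size=10)           # model class used in the examples above
model.load_state_dict(torch.load('model_checkpoint.pt',   # placeholder checkpoint file
                                 map_location=device))
model.to(device)

acc, cm, svc = test_classification(
    model,
    testing_quakes=['eq01'],                               # placeholder quake name
    device=device,
    data_dir=os.path.expanduser('~/.seisml/data/triggered_earthquakes'))
print('accuracy: {:.2f}'.format(acc))
print(cm)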
Example 2
    def test_test_mode(self):
        data_path = os.path.expanduser('~/.seisml/data/' +
                                       DownloadableData.SAMPLE_DATA.value)
        accepted_labels = ['positive', 'negative', 'chaos']
        testing_quakes = ['eq01']
        mode = DatasetMode.TEST
        train_ds = TriggeredEarthquake(
            data_dir=data_path,
            force_download=False,
            labels=accepted_labels,
            downloadable_data=DownloadableData.SAMPLE_DATA,
            mode=mode,
            testing_quakes=testing_quakes)
        for file in os.listdir(
                os.path.join(data_path, 'prepare_{}'.format(mode.value))):
            data = torch.load(
                os.path.join(data_path, 'prepare_{}'.format(mode.value), file))
            assert data['quake'] in testing_quakes, \
                'should only contain testing quakes'

        total_raw_count = 0
        for l in accepted_labels:
            for q in testing_quakes:
                count = len(os.listdir(os.path.join(data_path, 'raw', q, l)))
                total_raw_count += count

        ds_count = 0
        for _ in train_ds:
            ds_count += 1

        assert total_raw_count == ds_count, 'dataset should contain all examples'
    def test_siamese_learning(self):
        embedding_size = 10

        ds = TriggeredEarthquake(
            data_dir=os.path.expanduser('~/.seisml/data/' +
                                        DownloadableData.SAMPLE_DATA.value),
            force_download=False,
            downloadable_data=DownloadableData.SAMPLE_DATA)
        ds = SiameseDataset(ds)

        dl = DataLoader(ds, batch_size=24, num_workers=1, shuffle=True)

        model = DilatedConvolutional(embedding_size=embedding_size,
                                     downsample=False)
        test_data, test_label = next(iter(dl))
        params = filter(lambda p: p.requires_grad, model.parameters())
        opt = torch.optim.Adam(params, lr=0.01)
        l = DeepClusteringLoss()

        test_data = test_data.view(-1, 1, test_data.shape[-1])
        embedding_a = model(test_data)
        assert len(embedding_a[-1]
                   ) == embedding_size, 'output should match embedding size'
        _loss_a = l(embedding_a, test_label.float())

        for _ in range(4):
            for data, label in dl:
                data = data.view(-1, 1, data.shape[-1])
                output = model(data)
                _loss = l(output, label.float())
                opt.zero_grad()  # clear accumulated gradients before backprop
                _loss.backward()
                opt.step()

        embedding_b = model(test_data)
        _loss_b = l(embedding_b, test_label.float())
        assert _loss_b < _loss_a, 'loss should decrease after a few epochs of training'
Example 4
    def test_download_and_preproces(self):
        ds = TriggeredEarthquake(
            data_dir=os.path.expanduser('~/.seisml/data/' +
                                        DownloadableData.SAMPLE_DATA.value),
            force_download=False,
            downloadable_data=DownloadableData.SAMPLE_DATA)

        assert len(ds) > 0, 'files should exist'
        assert os.path.isdir(
            os.path.expanduser(
                '~/.seisml/data/' +
                DownloadableData.SAMPLE_DATA.value)), 'data should exist'
Example 5
    def test_get_item(self):
        ds = TriggeredEarthquake(
            data_dir=os.path.expanduser('~/.seisml/data/' +
                                        DownloadableData.SAMPLE_DATA.value),
            force_download=False,
            downloadable_data=DownloadableData.SAMPLE_DATA)

        dl = DataLoader(ds, batch_size=1, num_workers=1)
        sample = next(iter(dl))

        assert isinstance(sample[0], torch.Tensor), \
            'data output should be a tensor'
        assert np.sum(sample[1].numpy()) == 1, \
            'one-hot encoding should contain exactly one class'
    def test_paper_configuration(self):
        ds = TriggeredEarthquake(
            data_dir=os.path.expanduser('~/.seisml/data/' +
                                        DownloadableData.SAMPLE_DATA.value),
            force_download=False,
            downloadable_data=DownloadableData.SAMPLE_DATA)

        dl = DataLoader(ds, batch_size=1, num_workers=1, shuffle=True)

        model = DilatedConvolutional(embedding_size=10)
        data, label = next(iter(dl))
        summary(model, data.unsqueeze(1))

        num_params = sum(
            [p.numel() for p in model.parameters() if p.requires_grad])
        assert num_params == 7440, "number of params should match the paper's description"
Example 7
    def train_dataset(self):
        transform = triggered_earthquake_transform(sampling_rate=20.0,
                                                   max_freq=8.0,
                                                   min_freq=2.0,
                                                   corner=2,
                                                   aug_types=None,
                                                   aug_prob=0.5,
                                                   target_length=8192,
                                                   random_trim_offset=True)

        ds = TriggeredEarthquake(
            data_dir=os.path.expanduser(
                '~/.seisml/data/triggered_earthquakes'),
            force_download=False,
            downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
            labels=['positive', 'negative'],
            mode=DatasetMode.TRAIN,
            testing_quakes=['SAC_20021102_XF_prem'],
            transform=transform)
        return ds
Example 8
    def test_dataset_wrapping(self):
        ds = TriggeredEarthquake(
            data_dir=os.path.expanduser('~/.seisml/data/' + DownloadableData.SAMPLE_DATA.value),
            force_download=False,
            downloadable_data=DownloadableData.SAMPLE_DATA
        )

        batch_size = 4

        ds = SiameseDataset(ds)
        dl = DataLoader(ds, batch_size=batch_size, num_workers=1)
        data, label = next(iter(dl))

        data_batch = data.shape[0]
        label_batch = label.shape[0]
        data_channel = data.shape[1]
        label_channel = label.shape[1]

        assert data_batch == batch_size, 'data batch size should match'
        assert label_batch == batch_size, 'label batch size should match'
        assert data_channel == label_channel == 2, 'should have two channels'
        assert not np.allclose(data[0][0], data[0][1]), 'paired data points should not be equal'
        assert not np.allclose(label[0][0], label[0][1]), 'paired labels should belong to different classes'
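
The test above checks the shape contract of the wrapped dataset: each item becomes a pair stacked along a new channel dimension, and the two elements of a pair carry different labels. As a rough, simplified illustration of that contract only (not the actual SiameseDataset implementation), a pairing wrapper could look like this:

import random
import torch
from torch.utils.data import Dataset

class NaivePairDataset(Dataset):
    """Toy wrapper returning (data_pair, label_pair) with differing labels.

    Purely illustrative of the interface exercised by the test above;
    not the real SiameseDataset.
    """

    def __init__(self, base):
        self.base = base

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        data_a, label_a = self.base[idx]
        # draw a second example until its label differs from the first
        while True:
            j = random.randrange(len(self.base))
            data_b, label_b = self.base[j]
            if not torch.equal(torch.as_tensor(label_b), torch.as_tensor(label_a)):
                break
        # stack along a new leading "channel" dimension, matching the shape assertions
        data = torch.stack([torch.as_tensor(data_a), torch.as_tensor(data_b)])
        label = torch.stack([torch.as_tensor(label_a), torch.as_tensor(label_b)])
        return data, label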
Example 9
    def save_classifier(_):
        '''
        create and save two SVC classifiers in the model_dir:
            - one trained only on the training data
            - one trained on all data
        :param _:
        :return:
        '''
        # save classifier only trained on training data
        _, _, classifier = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        with open(os.path.join(model_dir, '{}_classifier.p'.format(prefix)),
                  'wb') as f:
            pickle.dump(classifier, f)

        # save classifier trained on all data (for running inference)
        ds = TriggeredEarthquake(
            data_dir=gin.query_parameter(
                'triggered_earthquake_dataset.data_dir'),
            testing_quakes=[],
            downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
            mode=DatasetMode.INFERENCE,
            transform=triggered_earthquake_transform(random_trim_offset=False),
        )
        loader = DataLoader(ds, batch_size=1, num_workers=10, shuffle=True)
        classifier_alldata = create_classifier(model,
                                               loader,
                                               type='svc',
                                               device=device)
        with open(
                os.path.join(model_dir, '{}_svc_classifier.p'.format(prefix)),
                'wb') as f:
            pickle.dump(classifier_alldata, f)
Example 10
def train(prefix, epochs, batch_size, num_workers, embedding_size, num_layers,
          learning_rate, weight_decay, model_dir, run_dir):
    ts = datetime.now().strftime("%m_%d_%Y__%H_%M")
    run_name = '{}_{}'.format(prefix, ts)

    model_dir = os.path.join(model_dir, run_name)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    writer = SummaryWriter(os.path.join(run_dir, run_name))

    ds_train = TriggeredEarthquake(
        mode=DatasetMode.TRAIN,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE)

    ds_test = TriggeredEarthquake(
        mode=DatasetMode.TEST,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        transform=triggered_earthquake_transform(random_trim_offset=False))
    # ds_train = SiameseDataset(ds_train)
    train_loader = DataLoader(ds_train,
                              batch_size=batch_size,
                              num_workers=num_workers,
                              shuffle=True)
    test_loader = DataLoader(ds_test,
                             batch_size=batch_size,
                             num_workers=num_workers,
                             shuffle=True)

    model = DilatedConvolutional(embedding_size=embedding_size,
                                 num_layers=num_layers)
    params = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = torch.optim.Adam(params,
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    loss_fn = DeepClusteringLoss()

    trainer = create_engine(model, optimizer, loss_fn, device)
    evaluator = create_eval(model, {'dcl': Loss(loss_fn)}, device)

    summary(
        model,
        (1,
         gin.query_parameter('triggered_earthquake_transform.target_length')))
    writer.add_graph(model,
                     next(iter(train_loader))[0].unsqueeze(1).to(device))

    save_handler = ModelCheckpoint(model_dir,
                                   prefix,
                                   n_saved=1,
                                   create_dir=True,
                                   require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_handler,
                              {'model': model})

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        """
        report training loss
        :param _:
        :return:
        """
        writer.add_scalar('Iter/train_loss', trainer.state.output,
                          trainer.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(_):
        """
        report training loss
        :param _:
        :return:
        """
        evaluator.run(train_loader)
        loss = trainer.state.output
        writer.add_scalar('Loss/train', loss, trainer.state.epoch)
        print("Training Results - Epoch: {} Avg loss: {:.2f}".format(
            trainer.state.epoch, trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def test_acc(_):
        """
        report testing accurarcy
        :param _:
        :return:
        """
        acc, cm, _, = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        writer.add_scalar('Accurarcy/test', acc, trainer.state.epoch)
        print('Testing Accurarcy: {:.2f}'.format(acc))
        print(cm)

    def report_embeddings(_):
        """
        write embeddings to tensorboard
        :param _:
        :return:
        """
        train_loader = DataLoader(ds_train, batch_size=1)
        test_loader = DataLoader(ds_test, batch_size=1)

        text_labels = gin.query_parameter(
            'triggered_earthquake_dataset.labels')
        train_embeddings, train_labels = get_embeddings(model,
                                                        train_loader,
                                                        device=device)
        train_labels = [
            text_labels[np.argmax(l)] for l in train_labels.squeeze(1)
        ]
        writer.add_embedding(train_embeddings.squeeze(1),
                             metadata=train_labels,
                             global_step=trainer.state.epoch,
                             tag='train_embeddings')

        test_embeddings, test_labels = get_embeddings(model,
                                                      test_loader,
                                                      device=device)
        test_labels = [
            text_labels[np.argmax(l)] for l in test_labels.squeeze(1)
        ]
        writer.add_embedding(test_embeddings.squeeze(1),
                             metadata=test_labels,
                             global_step=trainer.state.epoch,
                             tag='test_embeddings')

    trainer.add_event_handler(Events.EPOCH_COMPLETED(once=1),
                              report_embeddings)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=5),
                              report_embeddings)

    @trainer.on(Events.COMPLETED)
    def save_classifier(_):
        '''
        create and save two SVC classifiers in the model_dir:
            - one trained only on the training data
            - one trained on all data
        :param _:
        :return:
        '''
        # save classifier only trained on training data
        _, _, classifier = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        with open(os.path.join(model_dir, '{}_classifier.p'.format(prefix)),
                  'wb') as f:
            pickle.dump(classifier, f)

        # save classifier trained on all data (for running inference)
        ds = TriggeredEarthquake(
            data_dir=gin.query_parameter(
                'triggered_earthquake_dataset.data_dir'),
            testing_quakes=[],
            downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
            mode=DatasetMode.INFERENCE,
            transform=triggered_earthquake_transform(random_trim_offset=False),
        )
        loader = DataLoader(ds, batch_size=1, num_workers=10, shuffle=True)
        classifier_alldata = create_classifier(model,
                                               loader,
                                               type='svc',
                                               device=device)
        with open(
                os.path.join(model_dir, '{}_svc_classifier.p'.format(prefix)),
                'wb') as f:
            pickle.dump(classifier_alldata, f)

    @trainer.on(Events.COMPLETED)
    def save_metadata(_):
        '''
        save a metadata file, used for inference
        :param _:
        :return:
        '''
        transformer = triggered_earthquake_transform(random_trim_offset=False)
        transformer_path = os.path.join(model_dir, 'transformer.p')
        pickle.dump(transformer, open(transformer_path, 'wb'))

        metadata = {
            'name': run_name,
            'classes': gin.query_parameter('triggered_earthquake_dataset.labels'),
            'model_state_path': save_handler.last_checkpoint,
            'classifier_path': os.path.join(model_dir,
                                            '{}_classifier.p'.format(prefix)),
            'embedding_size': embedding_size,
            'num_layers': num_layers,
            'transformer': transformer_path
        }

        with open(os.path.join(model_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f)

    trainer.run(train_loader, max_epochs=epochs)
    writer.close()
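
The train entry point above reads several dataset and transform settings through gin (the gin.query_parameter calls), so a gin configuration must already be bound before it runs. Below is a hedged sketch of how it might be driven; the config file name and every hyperparameter value are placeholders, not the project's actual configuration:

import gin

# Hypothetical driver for train(); binds the dataset/transform parameters
# queried inside the function, then kicks off a run.
gin.parse_config_file('triggered_earthquake.gin')  # placeholder config path

train(prefix='tqe',
      epochs=50,
      batch_size=64,
      num_workers=4,
      embedding_size=10,
      num_layers=8,
      learning_rate=1e-3,
      weight_decay=1e-5,
      model_dir='models',
      run_dir='runs')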
Example 11
def inference(experiment_path, earthquake_path, labels=None):

    ts = datetime.now().strftime("%m_%d_%Y__%H_%M")
    result_name = '{}_{}'.format(earthquake_path.split('/')[-1], ts)

    # load metadata
    with open(os.path.join(experiment_path, 'metadata.json'), 'r') as f:
        metadata = json.load(f)

    model_path = metadata['model_state_path']
    classifier_path = metadata['classifier_path']
    embedding_size = metadata['embedding_size']
    num_layers = metadata['num_layers']
    transformer_path = metadata['transformer']

    # load the model
    state = torch.load(model_path)
    model = DilatedConvolutional(embedding_size=embedding_size,
                                 num_layers=num_layers)
    model.load_state_dict(state)

    # load the classifier
    classifier = pickle.load(open(classifier_path, 'rb'))

    # run through each example in the earthquake path
    transformer = pickle.load(open(transformer_path, 'rb'))
    dataset = TriggeredEarthquake(data_dir=earthquake_path,
                                  downloadable_data=None,
                                  mode=DatasetMode.INFERENCE,
                                  testing_quakes=[],
                                  labels=labels,
                                  transform=transformer)

    result_csv_path = os.path.join(experiment_path,
                                   '{}_results.csv'.format(result_name))
    headers = ['quake', 'name', 'given_label', 'classification']
    writer = csv.DictWriter(open(result_csv_path, 'w'), fieldnames=headers)
    writer.writeheader()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    embeddings = []
    for obs in dataset.processed_files:
        processed = torch.load(obs)
        data = processed['data']
        label = processed['label']
        quake = processed['quake']
        file_name = processed['file_name']

        embedding = model(data.view(
            -1, 1, data.shape[-1]).to(device)).detach().cpu().numpy()
        embeddings.append(embedding)

        classification = labels[np.argmax(classifier.predict(embedding))]
        writer.writerow({
            'quake': quake,
            'name': file_name,
            'given_label': label,
            'classification': classification
        })
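
Finally, a hedged usage sketch for the inference routine above; the experiment directory, earthquake data path, and label list are placeholders for illustration only:

import os

# Hypothetical call to inference(); all paths and labels are placeholders.
inference(
    experiment_path='models/tqe_01_01_2021__12_00',            # directory containing metadata.json
    earthquake_path=os.path.expanduser('~/.seisml/data/new_quake'),
    labels=['positive', 'negative'])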