Beispiel #1
0
def sample_metta_csv_diff_order(directory):
    """Archive a small train/test matrix pair with metta and wrap both in CSV stores.

    Both matrices carry the same columns, but the test matrix lists
    'm_feature' before 'k_feature' while the train matrix lists them the
    other way around.

    Args:
        directory (str): passed to metta as the archive directory; the
            returned stores point at '<uuid>.csv' and '<uuid>.yaml' inside it

    Returns:
        tuple: (train_store, test_store), each a CSVMatrixStore
    """
    def build_metadata(matrix_id, start_year):
        # Each matrix spans one year, from Jan 1 of start_year to Jan 1 of
        # the following year. A fresh dict (and 'indices' list) is built on
        # every call so the two configs share no mutable state.
        return {
            'feature_start_time': datetime.date(start_year, 1, 1),
            'end_time': datetime.date(start_year + 1, 1, 1),
            'matrix_id': matrix_id,
            'label_name': 'label',
            'label_timespan': '3month',
            'indices': ['entity_id'],
        }

    def open_store(uuid):
        # The matrix and metadata files are looked up by the uuid metta returned.
        return CSVMatrixStore(
            matrix_path=os.path.join(directory, '{}.csv'.format(uuid)),
            metadata_path=os.path.join(directory, '{}.yaml'.format(uuid))
        )

    train_columns = OrderedDict([
        ('entity_id', [1, 2]),
        ('k_feature', [0.5, 0.4]),
        ('m_feature', [0.4, 0.5]),
        ('label', [0, 1])
    ])
    # Same column names as the train matrix, with the two features swapped.
    test_columns = OrderedDict([
        ('entity_id', [3, 4]),
        ('m_feature', [0.4, 0.5]),
        ('k_feature', [0.5, 0.4]),
        ('label', [0, 1])
    ])

    train_uuid, test_uuid = metta.archive_train_test(
        train_config=build_metadata('train_matrix', 2014),
        df_train=pandas.DataFrame.from_dict(train_columns),
        test_config=build_metadata('test_matrix', 2015),
        df_test=pandas.DataFrame.from_dict(test_columns),
        directory=directory,
        format='csv'
    )

    return open_store(train_uuid), open_store(test_uuid)
Beispiel #2
0
    def test_as_of_dates_entity_index(self):
        """Check as_of_dates for a matrix whose only index is entity_id.

        The matrix has no as_of_date column, and the test expects the store
        to report a single date equal to the metadata's end_time.
        """
        frame = pd.DataFrame.from_dict(
            {
                "entity_id": [1, 2],
                "feature_one": [0.5, 0.6],
                "feature_two": [0.5, 0.6],
            }
        )
        expected_dates = ["2016-01-01"]
        with tempfile.TemporaryDirectory() as storage_root:
            store = CSVMatrixStore(ProjectStorage(storage_root), [], "test")
            store.matrix = frame
            store.metadata = {"end_time": "2016-01-01", "indices": ["entity_id"]}

            self.assertEqual(store.as_of_dates, expected_dates)
Beispiel #3
0
def test_as_of_dates_entity_index(project_storage):
    """Check as_of_dates for a matrix whose only index is entity_id.

    The label column is split off the frame and handed to the store as a
    (design matrix, labels) tuple; the expected result is a single date equal
    to the metadata's end_time.

    Args:
        project_storage: storage object handed to CSVMatrixStore
            (presumably supplied by a pytest fixture; confirm in conftest)
    """
    features = pd.DataFrame.from_dict(
        {
            "entity_id": [1, 2],
            "feature_one": [0.5, 0.6],
            "feature_two": [0.5, 0.6],
            "label": [0, 1],
        }
    )
    # pop() removes the label column from the frame in place.
    label_series = features.pop("label")

    store = CSVMatrixStore(project_storage, [], "test")
    store.matrix_label_tuple = (features, label_series)
    store.metadata = {"end_time": "2016-01-01", "indices": ["entity_id"], "label_name": "label"}

    expected_dates = ["2016-01-01"]
    assert store.as_of_dates == expected_dates
Beispiel #4
0
def test_as_of_dates(project_storage):
    """Check as_of_dates for a matrix indexed by entity_id and as_of_date.

    The as_of_date column holds pandas Timestamps covering two distinct days;
    the test expects the store to report them as two datetime.date values.

    Args:
        project_storage: storage object handed to CSVMatrixStore
            (presumably supplied by a pytest fixture; confirm in conftest)
    """
    # One Jan 1 timestamp per row: two rows for 2016, two for 2017.
    row_years = (2016, 2016, 2017, 2017)
    frame = pd.DataFrame.from_dict(
        {
            "entity_id": [1, 2, 1, 2],
            "feature_one": [0.5, 0.6, 0.5, 0.6],
            "feature_two": [0.5, 0.6, 0.5, 0.6],
            "as_of_date": [pd.Timestamp(year, 1, 1) for year in row_years],
            "label": [1, 0, 1, 0],
        }
    )
    store_metadata = {
        "indices": ["entity_id", "as_of_date"],
        "label_name": "label"
    }
    store = CSVMatrixStore(
        project_storage,
        [],
        "test",
        matrix=frame,
        metadata=store_metadata,
    )

    expected_dates = [datetime.date(2016, 1, 1), datetime.date(2017, 1, 1)]
    assert store.as_of_dates == expected_dates
Beispiel #5
0
    def test_as_of_dates_entity_index(self):
        """Check as_of_dates for a matrix whose only index is entity_id.

        With no as_of_date column present, the test expects the store to
        report a single date equal to the metadata's end_time.
        """
        columns = {
            'entity_id': [1, 2],
            'feature_one': [0.5, 0.6],
            'feature_two': [0.5, 0.6],
        }
        store_metadata = {
            'end_time': '2016-01-01',
            'indices': ['entity_id']
        }
        with tempfile.TemporaryDirectory() as storage_root:
            storage = ProjectStorage(storage_root)
            store = CSVMatrixStore(storage, [], 'test')
            store.matrix = pd.DataFrame.from_dict(columns)
            store.metadata = store_metadata

            self.assertEqual(store.as_of_dates, ['2016-01-01'])
Beispiel #6
0
def matrix_stores():
    """Yield the same gzip-CSV-backed store twice: with and without caching.

    The module-level DATA_DICT is written as 'df.csv.gz' and METADATA as
    'df.yaml' into a temporary directory, and a CSVMatrixStore named "df" is
    created on top of it.

    Yields:
        CSVMatrixStore: first while its cache() context is active, then again
        after that context has exited.
    """
    indexed_frame = pd.DataFrame.from_dict(DATA_DICT).set_index(MatrixStore.indices)

    with tempfile.TemporaryDirectory() as workdir:
        storage = ProjectStorage(workdir)
        metadata_file = os.path.join(workdir, "df.yaml")
        matrix_file = os.path.join(workdir, "df.csv.gz")

        with open(metadata_file, "w") as handle:
            yaml.dump(METADATA, handle, default_flow_style=False)
        indexed_frame.to_csv(matrix_file, compression="gzip")

        store = CSVMatrixStore(storage, [], "df")

        # First hand out the store while caching is switched on ...
        with store.cache():
            yield store

        # ... then once more after the cache context has exited, so
        # consumers also exercise the store without it.
        yield store
Beispiel #7
0
    def matrix_store(self):
        """Build in-memory, CSV-backed and HDF-backed stores over the same data.

        The same DataFrame and metadata are kept in memory and also written
        to a temporary directory as 'df.csv', 'df.h5' and 'metadata.yaml'.
        Before returning, the file-backed stores are asserted to agree with
        the in-memory store on matrix, metadata, head_of_matrix, empty and
        labels().

        Returns:
            list: [inmemory, csv, hdf]

        NOTE(review): the temporary directory is removed before this method
        returns, so the returned csv/hdf stores can only serve what they
        loaded during the assertions below -- presumably the stores cache
        their matrix and metadata; confirm.
        """
        data_dict = OrderedDict([('entity_id', [1, 2]),
                                 ('k_feature', [0.5, 0.4]),
                                 ('m_feature', [0.4, 0.5]), ('label', [0, 1])])
        df = pd.DataFrame.from_dict(data_dict)
        metadata = {
            'label_name': 'label',
            'indices': ['entity_id'],
        }

        inmemory = InMemoryMatrixStore(matrix=df, metadata=metadata)

        with tempfile.TemporaryDirectory() as tmpdir:
            tmpcsv = os.path.join(tmpdir, 'df.csv')
            tmpyaml = os.path.join(tmpdir, 'metadata.yaml')
            tmphdf = os.path.join(tmpdir, 'df.h5')

            # Close the metadata file as soon as it is written so that its
            # content is on disk before any store reads it back, and so the
            # handle is not held open during the unrelated work below.
            with open(tmpyaml, 'w') as outfile:
                yaml.dump(metadata, outfile, default_flow_style=False)

            df.to_csv(tmpcsv)
            # 'key' is passed by keyword: positional use is deprecated in
            # recent pandas releases.
            df.to_hdf(tmphdf, key='matrix')
            csv = CSVMatrixStore(matrix_path=tmpcsv, metadata_path=tmpyaml)
            hdf = HDFMatrixStore(matrix_path=tmphdf, metadata_path=tmpyaml)

            assert csv.matrix.to_dict() == inmemory.matrix.to_dict()
            assert hdf.matrix.to_dict() == inmemory.matrix.to_dict()

            assert csv.metadata == inmemory.metadata
            assert hdf.metadata == inmemory.metadata

            assert csv.head_of_matrix.to_dict(
            ) == inmemory.head_of_matrix.to_dict()
            assert hdf.head_of_matrix.to_dict(
            ) == inmemory.head_of_matrix.to_dict()

            assert csv.empty == inmemory.empty
            assert hdf.empty == inmemory.empty

            assert csv.labels().to_dict() == inmemory.labels().to_dict()
            assert hdf.labels().to_dict() == inmemory.labels().to_dict()

        matrix_store = [inmemory, csv, hdf]

        return matrix_store
Beispiel #8
0
def matrix_stores():
    """Yield CSV- and HDF-backed stores over the same data, with and without caching.

    The module-level DATA_DICT (indexed by entity_id) is written as 'df.csv'
    and 'df.h5', and METADATA as 'df.yaml', into a temporary directory. One
    CSVMatrixStore and one HDFMatrixStore named "df" are created on top of
    it and asserted to expose equal design matrices.

    Yields:
        The csv store and then the hdf store while both cache() contexts are
        active, followed by the same two stores again after those contexts
        have exited.
    """
    df = pd.DataFrame.from_dict(DATA_DICT).set_index(["entity_id"])

    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        tmphdf = os.path.join(tmpdir, "df.h5")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(METADATA, outfile, default_flow_style=False)
        df.to_csv(tmpcsv)
        # 'key' is passed by keyword: positional use is deprecated in recent
        # pandas releases.
        df.to_hdf(tmphdf, key="matrix")
        csv = CSVMatrixStore(project_storage, [], "df")
        hdf = HDFMatrixStore(project_storage, [], "df")
        assert csv.design_matrix.equals(hdf.design_matrix)
        # first test with caching
        with csv.cache(), hdf.cache():
            yield csv
            yield hdf
        # with the caching out of scope they will be nuked
        # and these last two versions will not have any cache
        yield csv
        yield hdf
Beispiel #9
0
    def matrix_stores(self):
        """Yield a CSV-backed and then an HDF-backed store over the same data.

        self.data_dict (indexed by entity_id) is written as 'df.csv' and
        'df.h5', and self.metadata as 'df.yaml', into a temporary directory.
        Both stores are created under the name "df" and asserted to expose
        equal matrices before being yielded.

        Yields:
            CSVMatrixStore, then HDFMatrixStore.
        """
        df = pd.DataFrame.from_dict(self.data_dict).set_index(["entity_id"])

        with tempfile.TemporaryDirectory() as tmpdir:
            project_storage = ProjectStorage(tmpdir)
            tmpcsv = os.path.join(tmpdir, "df.csv")
            tmpyaml = os.path.join(tmpdir, "df.yaml")
            tmphdf = os.path.join(tmpdir, "df.h5")

            # Close the metadata file as soon as it is written so that its
            # content is on disk before any store reads it back, and so the
            # handle is not held open while the generator is suspended.
            with open(tmpyaml, "w") as outfile:
                yaml.dump(self.metadata, outfile, default_flow_style=False)

            df.to_csv(tmpcsv)
            # 'key' is passed by keyword: positional use is deprecated in
            # recent pandas releases.
            df.to_hdf(tmphdf, key="matrix")
            csv = CSVMatrixStore(project_storage, [], "df")
            hdf = HDFMatrixStore(project_storage, [], "df")
            assert csv.matrix.equals(hdf.matrix)
            # Yield from inside the TemporaryDirectory context so the backing
            # files still exist while the consumer works with the stores.
            yield from [csv, hdf]
Beispiel #10
0
def test_as_of_dates_entity_date_index(project_storage):
    """Check as_of_dates for a matrix indexed by entity_id and as_of_date.

    The as_of_date column holds date strings covering two distinct days; the
    test expects the store to report each of them once, in ascending order.

    Args:
        project_storage: storage object handed to CSVMatrixStore
            (presumably supplied by a pytest fixture; confirm in conftest)
    """
    frame = pd.DataFrame.from_dict(
        {
            "entity_id": [1, 2, 1, 2],
            "feature_one": [0.5, 0.6, 0.5, 0.6],
            "feature_two": [0.5, 0.6, 0.5, 0.6],
            "as_of_date": ["2016-01-01", "2016-01-01", "2017-01-01", "2017-01-01"],
            "label": [1, 0, 1, 0]
        }
    )
    store_metadata = {"indices": ["entity_id", "as_of_date"], "label_name": "label"}
    store = CSVMatrixStore(
        project_storage,
        [],
        "test",
        matrix=frame,
        metadata=store_metadata
    )

    expected_dates = ["2016-01-01", "2017-01-01"]
    assert store.as_of_dates == expected_dates
Beispiel #11
0
    def test_s3_save(self):
        """Save a CSV matrix store to a mocked S3 bucket and read it back.

        The first store produced by self.matrix_stores() serves as the
        reference: its matrix and metadata are copied into a new store that
        is saved under the name "test", and a second store opened under the
        same name must expose equal metadata and matrix contents.
        """
        with mock_s3():

            s3 = boto3.client("s3")
            s3.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
            reference = next(self.matrix_stores())
            storage = ProjectStorage("s3://fake-matrix-bucket")

            # Write side: copy the reference content and persist it.
            writer = CSVMatrixStore(storage, [], "test")
            writer.matrix = reference.matrix
            writer.metadata = reference.metadata
            writer.save()

            # Read side: a fresh store object under the same name.
            reader = CSVMatrixStore(storage, [], "test")
            assert reader.metadata == reference.metadata
            assert reader.matrix.to_dict() == reference.matrix.to_dict()
Beispiel #12
0
def test_s3_save():
    """Save each CSV store from matrix_stores() to a mocked S3 bucket and read it back.

    Stores that are not CSVMatrixStore instances are skipped. For each CSV
    store, its metadata and (design matrix, labels) tuple are copied into a
    new store saved under the name "test"; a second store opened under the
    same name must expose equal metadata and design matrix contents.
    """
    with mock_s3():
        s3 = boto3.client("s3")
        s3.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
        for reference in matrix_stores():
            if isinstance(reference, CSVMatrixStore):
                storage = ProjectStorage("s3://fake-matrix-bucket")

                # Write side: metadata is assigned before the matrix tuple.
                writer = CSVMatrixStore(storage, [], "test")
                writer.metadata = reference.metadata
                writer.matrix_label_tuple = reference.matrix_label_tuple
                writer.save()

                # Read side: a fresh store object under the same name.
                reader = CSVMatrixStore(storage, [], "test")
                assert reader.metadata == reference.metadata
                assert reader.design_matrix.to_dict() == reference.design_matrix.to_dict()
Beispiel #13
0
    def test_s3_save(self):
        """Save a CSV matrix store to a mocked S3 bucket and read it back.

        The first store produced by self.matrix_stores() serves as the
        reference: its matrix and metadata are copied into a new store that
        is saved under the name 'test', and a second store opened under the
        same name must expose equal metadata and matrix contents.
        """
        with mock_s3():
            # boto3 is imported only once the S3 mock is active.
            import boto3
            s3 = boto3.client('s3')
            s3.create_bucket(
                Bucket='fake-matrix-bucket',
                ACL='public-read-write',
            )
            reference = next(self.matrix_stores())
            storage = ProjectStorage('s3://fake-matrix-bucket')

            # Write side: copy the reference content and persist it.
            writer = CSVMatrixStore(storage, [], 'test')
            writer.matrix = reference.matrix
            writer.metadata = reference.metadata
            writer.save()

            # Read side: a fresh store object under the same name.
            reader = CSVMatrixStore(storage, [], 'test')
            assert reader.metadata == reference.metadata
            assert reader.matrix.to_dict() == reference.matrix.to_dict()
Beispiel #14
0
    def test_s3_save(self):
        """Save every CSV store from self.matrix_store() to a mocked S3 bucket.

        Each CSVMatrixStore in the list is saved under the name 'test' and
        then reopened from 's3://fake-matrix-bucket/test.csv' and
        '.../test.yaml'. The reopened store must match the first entry of
        the list in both metadata and matrix contents.
        """
        with mock_s3():
            # boto3 is imported only once the S3 mock is active.
            import boto3
            s3 = boto3.client('s3')
            s3.create_bucket(
                Bucket='fake-matrix-bucket',
                ACL='public-read-write',
            )

            stores = self.matrix_store()

            for store in stores:
                if not isinstance(store, CSVMatrixStore):
                    continue

                store.save(project_path='s3://fake-matrix-bucket',
                           name='test')

                # Reopen what was just written.
                reloaded = CSVMatrixStore(
                    matrix_path='s3://fake-matrix-bucket/test.csv',
                    metadata_path='s3://fake-matrix-bucket/test.yaml')

                # The first list entry is the comparison baseline.
                baseline = stores[0]
                assert reloaded.metadata == baseline.metadata
                assert reloaded.matrix.to_dict() == baseline.matrix.to_dict()