# Example #1
# 0
    def __init__(self,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_bucket_name='cvae-insights',
                 local_data_store=False,
                 deployment_prefix='dev',
                 model_version='2019-01-03'):
        """Create an instance of GetData.

        :param aws_access_key_id: AWS access key; falls back to the
            AWS_S3_ACCESS_KEY_ID environment variable when not given.
        :param aws_secret_access_key: AWS secret key; falls back to the
            AWS_S3_SECRET_ACCESS_KEY environment variable when not given.
        :param aws_bucket_name: Name of the S3 bucket holding model data.
        :param local_data_store: When True, read from the local test data
            directory instead of S3.
        :param deployment_prefix: Deployment environment prefix.
        :param model_version: Version (date) of the trained model to load.
        """
        # Fix: the explicit credential parameters used to be ignored in
        # favour of the environment variables; honour them when provided.
        self.aws_access_key_id = (aws_access_key_id or
                                  os.environ.get('AWS_S3_ACCESS_KEY_ID', ''))
        self.aws_secret_access_key = (
            aws_secret_access_key or
            os.environ.get('AWS_S3_SECRET_ACCESS_KEY', ''))
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.deployment_prefix = deployment_prefix
        self.version_name = model_version
        if local_data_store:
            # Local mode: read fixtures from the repository's test data dir.
            self.s3_client = LocalDataStore('tests/test_data')
        else:
            self.s3_object = AmazonS3(
                bucket_name=self.bucket_name,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key)
            # load_S3() connects and returns the connected S3 object.
            self.s3_client = self.load_S3()

        self.utility = Utility()
 def __init__(self, *args, **kwargs):
     """Initialise the local data store and HPF object."""
     super(TestHPFScoringMethods, self).__init__(*args, **kwargs)
     # Score against the bundled test fixtures rather than S3.
     self.local_obj = LocalDataStore("tests/test_data")
     self.hpf_obj = HPFScoring(self.local_obj)
     # Separate scorer instance reserved for feedback-related tests.
     self.hpf_obj_feedback = HPFScoring(self.local_obj)
     # Mapping fixtures; start empty and are populated by individual tests.
     self.package_id_dict = OrderedDict()
     self.id_package_dict = OrderedDict()
     self.manifest_id_dict = OrderedDict()
# Example #3
# 0
def local_data_store(request, tmp_dir):
    """Return a LocalDataStore backed by a copy of the test data.

    Copies the repository's ``data`` directory into *tmp_dir* so tests can
    mutate files freely, and registers a finalizer that drops the store.
    """
    store = LocalDataStore(tmp_dir)

    # Locate the repository's data directory relative to this file and copy
    # its contents into the temporary directory.
    repo_root = Path(__file__).resolve().parents[1]
    source_dir = repo_root.joinpath("data").absolute()
    dir_util.copy_tree(source_dir, tmp_dir)

    def teardown():
        nonlocal store
        del store

    request.addfinalizer(teardown)
    return store
# Example #4
# 0
class TestPMFRecommendation(TestCase):
    """Test the core recommendations task."""
    def setUp(self):
        """Instantiate the resources required for the tests."""
        # Local fixture-backed data store — no S3 access during tests.
        self.fs = LocalDataStore('tests/test_data')
        self.assertTrue(self.fs.get_name().endswith('tests/test_data'))
        # Threshold 2, 5 latent factors — presumably matches the trained
        # fixture model; verify against the fixture data if it changes.
        self.pmf_rec = PMFRecommendation(2, data_store=self.fs, num_latent=5)

    def test__find_closest_user_in_training_set(self):
        """Test if we are getting correct "closest user" from the training set."""
        # Full match
        closest = self.pmf_rec._find_closest_user_in_training_set(
            [17190, 14774, 15406, 16594, 29063])
        self.assertIsNotNone(closest)
        # Partial
        closest = self.pmf_rec._find_closest_user_in_training_set(
            [17190, 14774, 15406])
        self.assertIsNotNone(closest)
        # Negative
        closest = self.pmf_rec._find_closest_user_in_training_set([3, 4])
        self.assertIsNone(closest)

    def test__sigmoid(self):
        """Test if the sigmoid function is behaving correctly."""
        # sigmoid(0) == 0.5 by definition.
        self.assertEqual(self.pmf_rec._sigmoid(0), 0.5)

    def test_predict(self):
        """Test the prediction flow."""
        # Test for a new stack.
        # predict() appears to return (missing, recommendation, package_tag_map).
        missing, recommendation, ptm = self.pmf_rec.predict(['pon-logger'])
        self.assertFalse(missing)
        # Should have two recommendations here.
        self.assertEqual(len(recommendation), 2)

        # Tests for missing package.
        missing, recommendation, _ = self.pmf_rec.predict(
            ['pon-logger', 'missing'])
        self.assertTrue(missing)
        # Test if still getting recommendation as no. of missing = no. of known
        self.assertGreater(len(recommendation), 0)

        # All packages missing: tag map should be empty.
        missing, _, package_tag_map = self.pmf_rec.predict(['missing'])
        self.assertDictEqual(package_tag_map, {})

        # Test for precomputed stack.
        _, recommendation, _ = self.pmf_rec.predict(
            ['async', 'colors', 'request', 'underscore', 'pkginfo'])
        self.assertTrue(recommendation)
Example #5
0
        bucket_name=cloud_constants.S3_BUCKET_NAME,  # pragma: no cover
        aws_access_key_id=cloud_constants.AWS_S3_ACCESS_KEY_ID,
        aws_secret_access_key=cloud_constants.AWS_S3_SECRET_KEY_ID)
    s3.connect()
elif LOCAL_ACCESS:
    print("INSIDE LOCAL ACCESS")
    s3 = AmazonS3(bucket_name=cloud_constants.S3_BUCKET_NAME,
                  aws_access_key_id=cloud_constants.AWS_S3_ACCESS_KEY_ID,
                  aws_secret_access_key=cloud_constants.AWS_S3_SECRET_KEY_ID,
                  endpoint_url=cloud_constants.AWS_S3_ENDPOINT_URL,
                  local_dev=True)
    s3.connect()
else:
    from rudra.data_store.local_data_store import LocalDataStore
    # Change the source directory here for local file system testing.
    s3 = LocalDataStore('tests/test_data/')
    ScoringParams.num_latent_factors = 5

# This needs to be global as ~200MB of data is loaded from S3 every time an object of this class
# is instantiated.
recommender = PMFRecommendation(ScoringParams.recommendation_threshold, s3,
                                ScoringParams.num_latent_factors)


@app.get('/api/v1/liveness', status_code=200)
def liveness():
    """Define the liveness probe."""
    # An empty JSON body with HTTP 200 signals the service is alive.
    return {}


@app.get('/api/v1/readiness', status_code=200)
# Error reporting: forward ERROR-level log records to Sentry when a DSN is
# configured in the environment.
SENTRY_DSN = os.environ.get("SENTRY_DSN", "")
sentry = Sentry(app, dsn=SENTRY_DSN, logging=True, level=logging.ERROR)
app.logger.info('App initialized, ready to roll...')

if cloud_constants.USE_CLOUD_SERVICES:
    # Cloud deployment: read trained model artefacts from S3.
    s3_client = AmazonS3(
        bucket_name=cloud_constants.S3_BUCKET_NAME,
        aws_access_key_id=cloud_constants.AWS_S3_ACCESS_KEY_ID,
        aws_secret_access_key=cloud_constants.AWS_S3_SECRET_KEY_ID)
    s3_client.connect()

else:
    from rudra.data_store.local_data_store import LocalDataStore

    # Change the source directory here for local file system testing.
    s3_client = LocalDataStore('tests/test_data')

# Single module-level scorer so the model data is loaded once per process.
recommender = HPFScoring(num_recommendations=10, data_store=s3_client)

daiquiri.setup(level=os.environ.get('FLASK_LOGGING_LEVEL', logging.INFO))
_logger = daiquiri.getLogger(__name__)


@app.route('/api/v1/liveness', methods=['GET'])
def liveness():
    """Respond to the liveness probe with an empty JSON body."""
    payload = flask.jsonify({})
    return payload, 200


@app.route('/api/v1/readiness', methods=['GET'])
def readiness():
 def test_load_rating(self):
     """Test the load_rating method."""
     # Read the fixture ratings file through a local data store.
     datastore = LocalDataStore('tests/test_data')
     ratings = load_rating('test_load_rating.txt', datastore)
     self.assertListEqual(ratings, [[5409, 2309, 54909, 2054], []])
# Example #8
# 0
 def __init__(self, *args, **kwargs):
     """Initialise the local data store and HPF object."""
     super(TestHPFScoringMethods, self).__init__(*args, **kwargs)
     # Score against the bundled test fixtures rather than S3.
     self.local_obj = LocalDataStore("tests/test_data")
     self.hpf_obj = HPFScoring(self.local_obj)
     # Separate scorer instance reserved for feedback-related tests.
     self.hpf_obj_feedback = HPFScoring(self.local_obj)
setup_logging(app.app)
CORS(app.app)

# NOTE(review): `global` statements at module level are no-ops; they only
# document that these names are set here and read elsewhere.
global scoring_status
global scoring_object
global s3_object

if HPF_SCORING_REGION != "":
    if convert_string2bool_env(USE_CLOUD_SERVICES):
        # Cloud deployment: score against model data stored in S3.
        s3_object = AmazonS3(bucket_name=AWS_S3_BUCKET_NAME,
                             aws_access_key_id=AWS_S3_ACCESS_KEY_ID,
                             aws_secret_access_key=AWS_S3_SECRET_ACCESS_KEY)
        s3_object.connect()
        app.scoring_object = HPFScoring(datastore=s3_object)
    else:
        # Local/test deployment: score against the bundled test fixtures.
        app.scoring_object = HPFScoring(LocalDataStore("tests/test_data"))
    app.scoring_status = True
else:
    # No scoring region configured — no model loaded, scoring disabled.
    app.scoring_status = False
    current_app.logger.warning("Have not loaded a model for scoring!")


def heart_beat():
    """Handle the / REST API call."""
    status_payload = {"status": "ok"}
    return flask.jsonify(status_payload)


def liveness():
    """Define the liveness probe."""
    # A fixed JSON body signals the service process is responsive.
    return flask.jsonify({"status": "alive"})
# Example #10
# 0
class GetData:
    """This class defines the S3 Connections viz fetching and storing data."""

    def __init__(self,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_bucket_name='cvae-insights',
                 local_data_store=False,
                 deployment_prefix='dev',
                 model_version='2019-01-03'):
        """Create an instance of GetData.

        :param aws_access_key_id: AWS access key; falls back to the
            AWS_S3_ACCESS_KEY_ID environment variable when not given.
        :param aws_secret_access_key: AWS secret key; falls back to the
            AWS_S3_SECRET_ACCESS_KEY environment variable when not given.
        :param aws_bucket_name: Name of the S3 bucket holding model data.
        :param local_data_store: When True, read from the local test data
            directory instead of S3.
        :param deployment_prefix: Deployment environment prefix.
        :param model_version: Version (date) of the trained model to load.
        """
        # Fix: the explicit credential parameters used to be ignored in
        # favour of the environment variables; honour them when provided.
        self.aws_access_key_id = (aws_access_key_id or
                                  os.environ.get('AWS_S3_ACCESS_KEY_ID', ''))
        self.aws_secret_access_key = (
            aws_secret_access_key or
            os.environ.get('AWS_S3_SECRET_ACCESS_KEY', ''))
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.deployment_prefix = deployment_prefix
        self.version_name = model_version
        if local_data_store:
            # Local mode: read fixtures from the repository's test data dir.
            self.s3_client = LocalDataStore('tests/test_data')
        else:
            self.s3_object = AmazonS3(
                bucket_name=self.bucket_name,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key)
            self.s3_client = self.load_S3()

        self.utility = Utility()

    def load_S3(self):
        """Establish the connection with S3.

        :return: the connected AmazonS3 object.
        :raises Exception: if the connection cannot be established.
        """
        self.s3_object.connect()
        if self.s3_object.is_connected():
            logger.info("S3 connection established.")
            return self.s3_object
        # Fix: raise with a message instead of a bare ``raise Exception``.
        raise Exception("Unable to establish S3 connection")

    def load_raw_data(self):
        """Load the raw data from S3 bucket.

        :return: the parsed manifest dictionary.
        :raises Exception: if the manifest file cannot be read.
        """
        raw_data_path = os.path.join(self.version_name,
                                     "data/manifest.json")
        try:
            raw_data_dict_ = self.s3_client.read_json_file(raw_data_path)
            # Lazy %-formatting avoids building the message unless emitted.
            logger.info("Size of Raw Manifest file is: %d",
                        len(raw_data_dict_))
            return raw_data_dict_
        except Exception as error:
            # Fix: chain the original error instead of discarding it with a
            # bare ``raise Exception``.
            raise Exception("Unable to load raw manifest data") from error

    def load_existing_data(self):
        """Load the node registry dump from S3 bucket.

        :return: DataFrame built from the registry dump JSON.
        :raises Exception: if the dump cannot be read or parsed.
        """
        clean_json_data_path = os.path.join(
            self.version_name, "data/node-package-details-with-url.json")
        try:
            logger.info("Path Existed")
            existing_data = self.s3_client.read_generic_file(
                clean_json_data_path)
            existing_df = self.utility.read_json_file(existing_data)
            logger.info("Size of Raw df with url is: %d",
                        len(existing_df))
            return existing_df
        except Exception as error:
            # Fix: preserve the original cause via exception chaining.
            raise Exception("S3 connection error") from error
 def setUp(self):
     """Instantiate the resources required for the tests."""
     # Local fixture-backed data store — no S3 access during tests.
     self.fs = LocalDataStore('tests/test_data')
     self.assertTrue(self.fs.get_name().endswith('tests/test_data'))
     # Threshold 2, 50 latent factors — presumably matches the trained
     # fixture model; verify against the fixtures if this changes.
     self.pmf_rec = PMFRecommendation(2, data_store=self.fs, num_latent=50)
"""Contains the path constants for S3 and local storage."""
import os
from rudra.data_store.local_data_store import LocalDataStore
num_users = os.environ.get("TRAIN_PER_USER", 5)

MODEL_VERSION = os.environ.get('MODEL_VERSION', '2019-01-03')
TEMPORARY_PATH = '/tmp/trained-model/'
TEMPORARY_DATA_PATH = '/tmp/data/'
TEMPORARY_MODEL_PATH = '/tmp/intermediate-model/cvae-model/'
TEMPORARY_DATASTORE = LocalDataStore('/tmp')
TEMPORARY_LATENT_PATH = os.path.join(TEMPORARY_MODEL_PATH,
                                     'latent_pretrain_all')
TEMPORARY_SDAE_PATH = os.path.join(TEMPORARY_MODEL_PATH, 'train')
TEMPORARY_CVAE_PATH = os.path.join(TEMPORARY_MODEL_PATH, 'cvae')
TEMPORARY_PMF_PATH = os.path.join(TEMPORARY_MODEL_PATH, 'pmf-packagedata.mat')
TEMPORARY_USER_ITEM_FILEPATH = os.path.join(
    TEMPORARY_DATA_PATH, "packagedata-train-" + str(num_users) + "-users.dat")
TEMPORARY_ITEM_USER_FILEPATH = os.path.join(
    TEMPORARY_DATA_PATH, "packagedata-train-" + str(num_users) + "-items.dat")
USER_ITEM_FILEPATH = os.path.join(
    MODEL_VERSION, 'data',
    "packagedata-train-" + str(num_users) + "-users.dat")
ITEM_USER_FILEPATH = os.path.join(
    MODEL_VERSION, 'data',
    "packagedata-train-" + str(num_users) + "-items.dat")
PRECOMPUTED_MANIFEST_PATH = os.path.join(MODEL_VERSION,
                                         "data/manifest_user_data.dat")
PMF_MODEL_PATH = os.path.join(MODEL_VERSION, 'intermediate-model/cvae-model',
                              'pmf-packagedata.mat')
PACKAGE_TO_ID_MAP = os.path.join(MODEL_VERSION,
                                 'trained-model/package_to_index_map.json')
class TestHPFScoringMethods(unittest.TestCase):
    """Test functionalities of hpf scoring."""
    def __init__(self, *args, **kwargs):
        """Initialise the local data store and HPF object."""
        super(TestHPFScoringMethods, self).__init__(*args, **kwargs)
        # Score against the bundled test fixtures rather than S3.
        self.local_obj = LocalDataStore("tests/test_data")
        self.hpf_obj = HPFScoring(self.local_obj)
        # Separate scorer instance reserved for feedback-related tests.
        self.hpf_obj_feedback = HPFScoring(self.local_obj)
        # Mapping fixtures; populated by test_load_objects from JSON files.
        self.package_id_dict = OrderedDict()
        self.id_package_dict = OrderedDict()
        self.manifest_id_dict = OrderedDict()

    def test_basic_object(self):
        """Test basic HPF object."""
        assert self.hpf_obj is not None
        assert self.hpf_obj.recommender is not None
        assert self.hpf_obj.m is not None

    # Currently we are not moving forward with this, but in future will look
    # on it. So commented.
    # def test_match_feedback_manifest(self):
    #     """Test match feedback manifest with dummy ids."""
    #     input_id_set = {1}
    #     id_ = self.hpf_obj_feedback.match_feedback_manifest(input_id_set)
    #     assert int(id_) == -1
    #     input_id_set = {64, 200, 66, 44}
    #     id_ = self.hpf_obj_feedback.match_feedback_manifest(input_id_set)
    #     assert int(id_) == 0
    #     id_ = self.hpf_obj.match_feedback_manifest(input_id_set)
    #     assert int(id_) == -1

    def test_load_objects(self):
        """Test logic where incoming data is correct or not."""
        # Load the package<->id mapping fixture from the local data store.
        self.package_id_dict = self.local_obj.read_json_file(
            HPF_output_package_id_dict)
        # Invert to id -> package for reverse lookups.
        self.id_package_dict = OrderedDict({
            x: n
            for n, x in self.package_id_dict.get("package_list", {}).items()
        })
        self.package_id_dict = OrderedDict(
            self.package_id_dict.get("package_list", {}))
        self.manifest_id_dict = self.local_obj.read_json_file(
            HPF_output_manifest_id_dict)
        # Each manifest maps to a set of package ids.
        self.manifest_id_dict = OrderedDict({
            n: set(x)
            for n, x in self.manifest_id_dict.get("manifest_list", {}).items()
        })
        self.assertTrue(isinstance(self.package_id_dict, dict))
        self.assertTrue(isinstance(self.id_package_dict, dict))
        self.assertTrue(isinstance(self.manifest_id_dict, dict))

    def test_recommend_known_user(self):
        """Test logic where we recommend for a known user(exists in training set)."""
        recommendation, user_id = self.hpf_obj.recommend_known_user(0)
        assert recommendation is not None
        assert user_id is not None

    def test_recommend_new_user(self):
        """Test the fold-in logic where we calculate factors for new user."""
        recommendation, user_id = self.hpf_obj.recommend_new_user([0])
        assert recommendation is not None
        assert user_id is not None

    def test_predict_missing(self):
        """Test no prediction in case of higher than threshold missing package ratio."""
        # predict() needs an active application context (logging/config).
        with app.app.app_context():
            recommendation = self.hpf_obj.predict(['missing-pkg'])
            # Presumably a tuple whose [0] holds companion recommendations
            # and [2] the missing packages — verify against HPFScoring.predict.
            self.assertFalse(recommendation[0])
            self.assertTrue(recommendation[2])

    def test_model_details(self):
        """Test the basic model details function."""
        details = "The model will be scored against 12405 Packages, 9523 Manifests."

        print(self.hpf_obj.model_details())
        assert self.hpf_obj.model_details() == details

    #     # assert self.hpf_obj_feedback.model_details() == details

    def test_get_sizeof(self):
        """Test static _getsizeof method."""
        int_value = 1
        # NOTE(review): expected size of a small int in MB is CPython- and
        # platform-dependent; this magic value may break on other builds.
        int_size = 2.6702880859375e-05
        assert HPFScoring._getsizeof(int_value) == "{} MB".format(int_size)