Beispiel #1
0
    def _get_raw_data(self):
        """
        Return the train and test ratings as a ``(train, test)`` pair of
        pandas DataFrames.

        The archive is downloaded first when it is not already on disk.

        For ``MoviLens.DS10K`` the ``ua.base`` / ``ua.test`` files stored in
        the archive are read and returned as they are.

        For any other dataset type ``ml-20m/ratings.csv`` is read and split
        here: the last ``validation_size`` fraction of the users (in order of
        first appearance) is held out, and part of each held-out user's
        ratings is moved back to the train set according to
        ``self.protocol``:

        * ``protocol > 0`` ("given N"): the first N ratings of each held-out
          user go to train, the remaining ones stay in test.
        * ``protocol <= 0`` ("all but N"): the first ``abs(protocol)``
          ratings of each held-out user stay in test, the remaining ones go
          to train.

        "First" means oldest by ``timestamp`` when ``self.protocol_sorted``
        is true and a random order otherwise.
        """

        path = self.__get_raw_path(self.dataset_type)

        if not os.path.isfile(path):
            self._download_data(path)

        if self.dataset_type == MoviLens.DS10K:
            # NOTE(review): read_csv is called with its default header
            # handling, so the first line of each file becomes the column
            # names -- confirm that ua.base / ua.test really carry a header.
            with zipfile.ZipFile(path) as datafile:
                return (pd.read_csv(IOReader(datafile.read('ml-100k/ua.base')),
                                    sep="\t"),
                        pd.read_csv(IOReader(datafile.read('ml-100k/ua.test')),
                                    sep="\t"))

        with zipfile.ZipFile(path) as datafile:
            data = pd.read_csv(IOReader(datafile.read('ml-20m/ratings.csv')),
                               sep=",")

        # Number each user's ratings 0, 1, 2, ... either chronologically or
        # in a random order. cumcount() keeps the original index, so the
        # assignment aligns every rank with its own row.
        if self.protocol_sorted:
            ordered = data.sort_values(by=['timestamp'])
        else:
            ordered = data.reindex(np.random.permutation(data.index))
        data['order'] = ordered.groupby(by='userId').cumcount()

        # Split by users: the trailing `validation_size` fraction is held out.
        users = data.userId.unique()
        users_split = len(users) - int(len(users) * self.validation_size)
        ds_train = data[data.userId.isin(users[:users_split])]
        held_out = data[data.userId.isin(users[users_split:])]

        if self.protocol > 0:
            # Given protocol: `order` is 0-based, so the first N ratings are
            # exactly those with order < N (order <= N would hand over N + 1).
            to_train = held_out.order < self.protocol
        else:
            # All-but protocol: the first N ratings are kept for testing and
            # everything after them is used for training.
            to_train = held_out.order >= abs(self.protocol)

        ds_train = pd.concat((ds_train, held_out[to_train]), axis=0)
        ds_test = held_out[~to_train]

        # `order` is only a helper column and is not part of the result.
        return ds_train.drop('order', axis=1), ds_test.drop('order', axis=1)
Beispiel #2
0
def _pull_model_and_fingerprint(model_server, model_directory, fingerprint):
    # type: (EndpointConfig, Text, Optional[Text]) -> Optional[Text]
    """Fetch a model archive from the model server and unpack it.

    A GET request is sent with ``fingerprint`` as the ``If-None-Match``
    header. When the server answers with status 200, the response body is
    treated as a zip archive and extracted into ``model_directory``.

    Returns the value of the response's ``ETag`` header (``None`` if the
    header is absent) for status codes 200, 204 and 304. Returns ``None``
    when the request raises ``RequestException``, when the server answers
    404, or when it answers with any other status code.
    """
    header = {"If-None-Match": fingerprint}
    try:
        logger.debug("Requesting model from server {}..."
                     "".format(model_server.url))
        response = model_server.request(method="GET",
                                        headers=header,
                                        timeout=DEFAULT_REQUEST_TIMEOUT)
    except RequestException as e:
        logger.warning("Tried to fetch model from server, but couldn't reach "
                       "server. We'll retry later... Error: {}."
                       "".format(e))
        return None

    if response.status_code in [204, 304]:
        # Nothing new to download; no archive is extracted on this path.
        logger.debug("Model server returned {} status code, indicating "
                     "that no new model is available. "
                     "Current fingerprint: {}"
                     "".format(response.status_code, fingerprint))
        return response.headers.get("ETag")
    elif response.status_code == 404:
        logger.debug("Model server didn't find a model for our request. "
                     "Probably no one did train a model for the project "
                     "and tag combination yet.")
        return None
    elif response.status_code != 200:
        logger.warning("Tried to fetch model from server, but server response "
                       "status code is {}. We'll retry later..."
                       "".format(response.status_code))
        return None

    # Use the archive as a context manager so it is closed even when
    # extraction fails part-way through.
    with zipfile.ZipFile(IOReader(response.content)) as zip_ref:
        zip_ref.extractall(model_directory)
    logger.debug("Unzipped model to {}"
                 "".format(os.path.abspath(model_directory)))

    # get the new fingerprint
    return response.headers.get("ETag")