def _get_raw_data(self):
    """Load the ratings archive and return the train and test DataFrames.

    The archive is expected at the path returned by ``__get_raw_path`` for
    ``self.dataset_type``; when no file exists there, ``_download_data`` is
    called with that path first.

    * ``MoviLens.DS10K``: the ``ml-100k/ua.base`` and ``ml-100k/ua.test``
      archive members are parsed as tab-separated tables and returned as-is.
    * Any other dataset type: ``ml-20m/ratings.csv`` is parsed and split by
      user. The last ``int(n_users * self.validation_size)`` users (in order
      of first appearance) are held out. Each rating gets a 0-based
      per-user rank, chronological when ``self.protocol_sorted`` is truthy
      and random otherwise, and ``self.protocol`` decides how the held-out
      users' ratings are divided:

      - ``protocol > 0`` ("given N"): the first N ratings of every held-out
        user go to the train set, the remaining ones to the test set.
      - ``protocol <= 0`` ("all but N"): the first ``abs(protocol)`` ratings
        of every held-out user go to the test set, the rest to train.

    Returns:
        tuple: ``(ds_train, ds_test)`` pandas DataFrames. The helper rank
        column is removed before returning.
    """
    path = self.__get_raw_path(self.dataset_type)
    if not os.path.isfile(path):
        self._download_data(path)

    if self.dataset_type == MoviLens.DS10K:
        # NOTE(review): read_csv is used with its default header handling,
        # so the first line of each member becomes the column names --
        # confirm these members really start with a header row.
        with zipfile.ZipFile(path) as datafile:
            train = pd.read_csv(
                IOReader(datafile.read('ml-100k/ua.base')), sep="\t")
            test = pd.read_csv(
                IOReader(datafile.read('ml-100k/ua.test')), sep="\t")
        return train, test

    with zipfile.ZipFile(path) as datafile:
        data = pd.read_csv(
            IOReader(datafile.read('ml-20m/ratings.csv')), sep=",")

    # Rank each rating within its user (0-based). The ranked Series keeps
    # the original row labels, so the assignment below aligns it back onto
    # `data` regardless of the temporary row order.
    if self.protocol_sorted:
        ranked_rows = data.sort_values(by=['timestamp'])
    else:
        ranked_rows = data.reindex(np.random.permutation(data.index))
    data['order'] = ranked_rows.groupby(by='userId').cumcount()

    # Split by users: the tail of the unique-user list is held out.
    users = data.userId.unique()
    users_split = len(users) - int(len(users) * self.validation_size)
    ds_train = data[data.userId.isin(users[:users_split])]
    held_out = data[data.userId.isin(users[users_split:])]

    # `order` is 0-based, so "the first N ratings" is `order < N`. The
    # previous `order <= N` test in the given-N branch leaked N + 1 ratings
    # per held-out user into the train set.
    n_ratings = abs(self.protocol)
    is_first_n = held_out.order < n_ratings
    if self.protocol > 0:
        # given protocol: first N ratings are used to train
        extra_train, ds_test = held_out[is_first_n], held_out[~is_first_n]
    else:
        # all_but protocol: first N ratings are used to test (rest to train)
        extra_train, ds_test = held_out[~is_first_n], held_out[is_first_n]
    ds_train = pd.concat((ds_train, extra_train), axis=0)

    # Drop the helper column without mutating filtered slices in place.
    return ds_train.drop('order', axis=1), ds_test.drop('order', axis=1)
def _pull_model_and_fingerprint(model_server, model_directory, fingerprint):
    # type: (EndpointConfig, Text, Optional[Text]) -> Optional[Text]
    """Query the model server and unpack a newly served model, if any.

    A GET request is sent through ``model_server.request`` with the current
    ``fingerprint`` in the ``If-None-Match`` header.

    Args:
        model_server: endpoint object providing ``url`` and ``request``.
        model_directory: directory the downloaded zip archive is extracted
            into.
        fingerprint: fingerprint of the currently loaded model, sent as
            the ``If-None-Match`` value (may be ``None``).

    Returns:
        The value of the response's ``ETag`` header (``None`` if the header
        is absent) when the server answers 200, 204 or 304. For a 200
        response the body is extracted into ``model_directory`` first.
        ``None`` when the request raises ``RequestException``, when the
        server answers 404, or for any other status code.
    """
    header = {"If-None-Match": fingerprint}
    try:
        logger.debug("Requesting model from server {}..."
                     "".format(model_server.url))
        response = model_server.request(method="GET",
                                        headers=header,
                                        timeout=DEFAULT_REQUEST_TIMEOUT)
    except RequestException as e:
        logger.warning("Tried to fetch model from server, but couldn't reach "
                       "server. We'll retry later... Error: {}."
                       "".format(e))
        return None

    if response.status_code in [204, 304]:
        logger.debug("Model server returned {} status code, indicating "
                     "that no new model is available. "
                     "Current fingerprint: {}"
                     "".format(response.status_code, fingerprint))
        return response.headers.get("ETag")
    elif response.status_code == 404:
        logger.debug("Model server didn't find a model for our request. "
                     "Probably no one did train a model for the project "
                     "and tag combination yet.")
        return None
    elif response.status_code != 200:
        logger.warning("Tried to fetch model from server, but server response "
                       "status code is {}. We'll retry later..."
                       "".format(response.status_code))
        return None

    # Use the archive as a context manager so it is closed even when
    # extraction raises; it was previously left open.
    with zipfile.ZipFile(IOReader(response.content)) as zip_ref:
        zip_ref.extractall(model_directory)
    logger.debug("Unzipped model to {}"
                 "".format(os.path.abspath(model_directory)))

    # get the new fingerprint
    return response.headers.get("ETag")