def session_aggr(sf, cols, key="session_code"):
    """Aggregate the mean, std, min and max of the given columns per session."""
    mean_operations = {("%s_mean" % col): agg.MEAN(col) for col in cols}
    std_operations = {("%s_std" % col): agg.STD(col) for col in cols}
    min_operations = {("%s_min" % col): agg.MIN(col) for col in cols}
    max_operations = {("%s_max" % col): agg.MAX(col) for col in cols}
    all_operations = {}
    all_operations.update(mean_operations)
    all_operations.update(std_operations)
    all_operations.update(min_operations)
    all_operations.update(max_operations)
    return sf.groupby(key_column_names=[key], operations=all_operations)
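# Illustrative usage sketch for session_aggr (not from the original source).
# Assumes `agg` is turicreate.aggregate; the toy SFrame and column names below
# are hypothetical.
import turicreate as tc
import turicreate.aggregate as agg

toy_sessions = tc.SFrame({
    "session_code": [1, 1, 2, 2, 2],
    "tempo": [120.0, 124.0, 98.0, 101.0, 99.0],
    "loudness": [-7.1, -6.8, -11.2, -10.9, -11.5],
})
# Yields one row per session_code with tempo_mean, tempo_std, ..., loudness_max.
toy_stats = session_aggr(toy_sessions, cols=["tempo", "loudness"])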
def get_venue_authors_timeseries(self):
    """Return a TimeSeries with each author's first and last publication year,
    one row per author, indexed by the first year."""
    p = self._all_papers_sf["Paper ID", "Paper publish year"]
    a = self.authors_affilations_sframe["Paper ID", "Author ID"]
    sf = p.join(a, on="Paper ID")["Author ID", "Paper publish year"]
    sf = sf.groupby("Author ID", {
        "mindate": agg.MIN("Paper publish year"),
        "maxdate": agg.MAX("Paper publish year")
    })
    sf.rename({"Author ID": "v_id"})
    sf["mindate"] = sf["mindate"].apply(
        lambda y: datetime(year=y, month=1, day=1))
    sf["maxdate"] = sf["maxdate"].apply(
        lambda y: datetime(year=y, month=1, day=1))
    if sf.num_rows() == 0:
        return None
    return tc.TimeSeries(sf, index="mindate")
def predict(self, dataset, output_type='class', output_frequency='per_row'):
    """
    Return predictions for ``dataset``, using the trained activity classifier.
    Predictions can be generated as class labels, or as a probability vector
    with probabilities for each class.

    The activity classifier generates a single prediction for each
    ``prediction_window`` rows in ``dataset``, per ``session_id``. Thus the
    number of predictions is smaller than the length of ``dataset``. By
    default each prediction is replicated by ``prediction_window`` to return
    a prediction for each row of ``dataset``. Use ``output_frequency`` to get
    the unreplicated predictions.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    output_type : {'class', 'probability_vector'}, optional
        Form of each prediction which is one of:

        - 'probability_vector': Prediction probability associated with each
          class as a vector. The probability of the first class (sorted
          alphanumerically by name of the class in the training set) is in
          position 0 of the vector, the second in position 1 and so on.
        - 'class': Class prediction. This returns the class with maximum
          probability.

    output_frequency : {'per_row', 'per_window'}, optional
        The frequency of the predictions which is one of:

        - 'per_window': Return a single prediction for each
          ``prediction_window`` rows in ``dataset`` per ``session_id``.
        - 'per_row': Convenience option to make sure the number of
          predictions match the number of rows in the dataset. Each
          prediction from the model is repeated ``prediction_window``
          times during that window.

    Returns
    -------
    out : SArray | SFrame
        If ``output_frequency`` is 'per_row' return an SArray with
        predictions for each row in ``dataset``.
        If ``output_frequency`` is 'per_window' return an SFrame with
        predictions for ``prediction_window`` rows in ``dataset``.

    See Also
    --------
    create, evaluate, classify

    Examples
    --------

    .. sourcecode:: python

        # One prediction per row
        >>> probability_predictions = model.predict(
        ...     data, output_type='probability_vector', output_frequency='per_row')[:4]
        >>> probability_predictions

        dtype: array
        Rows: 4
        [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
         array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
         array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
         array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])]

        # One prediction per window
        >>> class_predictions = model.predict(
        ...     data, output_type='class', output_frequency='per_window')
        >>> class_predictions
        +---------------+------------+-----+
        | prediction_id | session_id |class|
        +---------------+------------+-----+
        |       0       |     3      |  5  |
        |       1       |     3      |  5  |
        |       2       |     3      |  5  |
        |       3       |     3      |  5  |
        |       4       |     3      |  5  |
        |       5       |     3      |  5  |
        |       6       |     3      |  5  |
        |       7       |     3      |  4  |
        |       8       |     3      |  4  |
        |       9       |     3      |  4  |
        |      ...      |    ...     | ... |
        +---------------+------------+-----+
    """
    _tkutl._raise_error_if_not_sframe(dataset, 'dataset')
    _tkutl._check_categorical_option_type(
        'output_frequency', output_frequency, ['per_window', 'per_row'])
    _tkutl._check_categorical_option_type(
        'output_type', output_type, ['probability_vector', 'class'])
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._sframe_sequence_iterator import _ceil_dev

    prediction_window = self.prediction_window
    chunked_dataset, _ = _prep_data(dataset, self.features, self.session_id,
                                    prediction_window,
                                    self._predictions_in_chunk, verbose=False)
    data_iter = _SFrameSequenceIter(chunked_dataset, len(self.features),
                                    prediction_window,
                                    self._predictions_in_chunk,
                                    self._recalibrated_batch_size,
                                    use_pad=True)

    chunked_data = data_iter.dataset
    preds = self._pred_model.predict(data_iter).asnumpy()

    if output_frequency == 'per_row':
        # Replicate each prediction times prediction_window
        preds = preds.repeat(prediction_window, axis=1)

        # Remove predictions for padded rows
        unpadded_len = chunked_data['chunk_len'].to_numpy()
        preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

        # Reshape from (num_of_chunks, chunk_size, num_of_classes)
        # to (ceil(length / prediction_window), num_of_classes)
        # chunk_size is DIFFERENT between chunks - since padding was removed.
        out = _np.concatenate(preds)
        out = out.reshape((-1, len(self._target_id_map)))
        out = _SArray(out)

        if output_type == 'class':
            id_target_map = self._id_target_map
            out = out.apply(lambda c: id_target_map[_np.argmax(c)])

    elif output_frequency == 'per_window':
        # Calculate the number of expected predictions and
        # remove predictions for padded data
        unpadded_len = chunked_data['chunk_len'].apply(
            lambda l: _ceil_dev(l, prediction_window)).to_numpy()
        preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

        out = _SFrame({
            self.session_id: chunked_data['session_id'],
            'preds': _SArray(preds, dtype=list)
        }).stack('preds', new_column_name='probability_vector')

        # Calculate the prediction index per session
        out = out.add_row_number(column_name='prediction_id')
        start_sess_idx = out.groupby(
            self.session_id, {'start_idx': _agg.MIN('prediction_id')})
        start_sess_idx = start_sess_idx.unstack(
            [self.session_id, 'start_idx'], new_column_name='idx')['idx'][0]

        if output_type == 'class':
            id_target_map = self._id_target_map
            out['probability_vector'] = out['probability_vector'].apply(
                lambda c: id_target_map[_np.argmax(c)])
            out = out.rename({'probability_vector': 'class'})

    return out
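# Illustrative end-to-end sketch (not part of the library code above): train an
# activity classifier with turicreate's public API and call predict() with both
# output frequencies. The SFrame path and the 'session_id' / 'activity' column
# names are hypothetical placeholders.
import turicreate as tc

data = tc.SFrame('sensor_sessions.sframe')  # hypothetical sensor dataset
model = tc.activity_classifier.create(data, session_id='session_id',
                                      target='activity', prediction_window=50)

per_row_classes = model.predict(data, output_type='class',
                                output_frequency='per_row')
per_window_probs = model.predict(data, output_type='probability_vector',
                                 output_frequency='per_window')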
    'Paper ID', {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
r_sf.save('/data/sframes/PapersRefCount.sframe')
r_sf = r_sf[r_sf['Ref Count'] >= 5]  # left with 22,083,058
p_sf = gl.load_sframe("./Papers.sframe/")  # 126,903,970 rows
p_sf = r_sf.join(p_sf)  # 22,082,741
p_sf.save('./PapersMin5Ref.sframe')
p_sf = gl.load_sframe('./PapersMin5Ref.sframe')
a_sf = gl.load_sframe('./PaperAuthorAffiliations.sframe/')  # 337000127
sf = p_sf[['Paper ID']].join(a_sf)  # 86,561,861 rows
sf = sf.join(p_sf, on="Paper ID")
sf.groupby("Author ID", {
    'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
    'start_year': agg.MIN('Paper publish year'),
    'last_year': agg.MAX('Paper publish year'),
    'mean_ref_count': agg.AVG('Ref Count'),
    'papers_list': agg.CONCAT('Paper ID'),
    'journals_list': agg.CONCAT('Journal ID mapped to venue name'),
    'conference_list': agg.CONCAT('Conference ID mapped to venue name'),
    'affilation_list': agg.CONCAT('Affiliation ID')
})

sf = gl.SFrame()
r = re.compile(r"\d{4}")
for i in l:
    try:
        y = r.findall(i)[0]
        x = gl.SFrame.read_csv("%s/%s" % (p, i))
        x['Year'] = y
max_operations = {("%s_max" % col): agg.MAX(col) for col in cols} all_operations = {} all_operations.update(mean_operations) all_operations.update(std_operations) all_operations.update(min_operations) all_operations.update(max_operations) return sf.groupby(key_column_names=[key], operations=all_operations) session_stats = session_aggr(session_data, agg_cols_total) print("## vi.) Session heterosity") ops = { "num_uniq_tracks": agg.COUNT_DISTINCT("track_code"), "session_length": agg.MIN("session_length") } uniq_info = session_data.groupby("session_code", operations=ops) uniq_info["track_heterogenity"] = uniq_info["num_uniq_tracks"] / uniq_info[ "session_length"] uniq_info["track_repetition"] = uniq_info["session_length"] - uniq_info[ "num_uniq_tracks"] uniq_info = uniq_info.remove_column("session_length") session_stats = session_stats.join(uniq_info, on="session_code") del uniq_info session_stats.save("%s/sess_stats" % folder, format='binary') print("## vii.) Separate first and second half of the playlist") session_data["position_over_length"] = session_data[
        if col in m_col:
            cols_for_total_aggr.append(m_col)
            break
print("cols for total aggregations:", cols_for_total_aggr)

agg_total_operations = {}
for col in cols_for_total_aggr:
    parts = col.split("_")
    feat = "_".join(parts[:-1])
    if parts[-1] == "mean" and feat != "dist_from_sess":
        agg_total_operations[col] = agg.MEAN(feat)
    elif parts[-1] == "std":
        agg_total_operations[col] = agg.STD(feat)
    elif parts[-1] == "min":
        agg_total_operations[col] = agg.MIN(feat)
    elif parts[-1] == "max":
        agg_total_operations[col] = agg.MAX(feat)
    else:
        print(col)
        continue
session_stats = session_data.groupby(key_column_names=["session_code"],
                                     operations=agg_total_operations)

print("## vi.) Session heterogeneity")
ops = {
    "num_uniq_tracks": agg.COUNT_DISTINCT("track_code"),
    "session_length": agg.MIN("session_length")
}
uniq_info = session_data.groupby("session_code", operations=ops)
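# Sketch of a more table-driven variant of the suffix parsing above (not from
# the original source; it omits the dist_from_sess special case for brevity,
# and the example column names in the comment are hypothetical).
def build_suffix_operations(cols):
    suffix_to_agg = {"mean": agg.MEAN, "std": agg.STD,
                     "min": agg.MIN, "max": agg.MAX}
    ops = {}
    for col in cols:
        feat, _, suffix = col.rpartition("_")
        if suffix in suffix_to_agg:
            # e.g. "tempo_mean" -> {"tempo_mean": agg.MEAN("tempo")}
            ops[col] = suffix_to_agg[suffix](feat)
    return ops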