def partialcase_to_fm_format(caseid, partialcase, cid_list,
                             activity_list, negative_samples, seed,
                             normalize, pred_id):
    '''
    Encode one next-activity prediction of a partial case in sparse
    (data, row, col) factorization-machine format.

    The last element of ``partialcase`` is the activity to predict, the
    elements before it are the activities already taken. One row is emitted
    per candidate activity (the true one plus the negatives) with the column
    layout::

        [ case id | taken activities | candidate next activity ]
          |cid_list|  |activity_list|     |activity_list|

    Parameters:
    - caseid: identifier of the case; encoded against ``cid_list``.
    - partialcase: array of activities; must expose ``.shape``.
    - cid_list: array of all case identifiers (``.shape[0]`` is used).
    - activity_list: array of all activities (``.shape[0]`` is used).
    - negative_samples: number of negatives to draw without replacement;
      clamped to the number of available negatives. A value below 0 means
      "use every other activity".
    - seed: accepted for interface compatibility; NOT used here, the global
      numpy random state is consumed as-is.
    - normalize: accepted for interface compatibility; NOT used here.
    - pred_id: identifier stamped on every emitted row.

    Returns a 6-tuple ``(x_datalist, x_row_inds, x_col_inds, x_shape,
    y_datalist, pred_id_list)``. When ``partialcase`` has two or fewer
    elements the arrays are empty and ``x_shape`` is ``[0., 0.]``.
    '''
    if partialcase.shape[0] <= 2:
        # not enough events to make a step
        return (np.asarray([]), np.asarray([]), np.asarray([]),
                np.zeros(shape=(2,)), np.asarray([]), np.asarray([]))

    num_cids = cid_list.shape[0]
    num_acts = activity_list.shape[0]

    # make into 2-steps: what has been taken, and what is to be predicted
    subpartialcase = partialcase[:-1]
    to_predict = partialcase[-1]

    # every activity except the true one is a potential negative
    negatives = [act for act in activity_list if act != to_predict]
    if negative_samples > -1:
        # cannot draw more negatives than there are without replacement
        random_sz = min(negative_samples, len(negatives))
        negatives = np.random.choice(negatives, size=random_sz,
                                     replace=False)
    # the true activity always goes last
    samples = np.append(negatives, [to_predict])
    num_samples = len(samples)

    # one row per sample; derived from the samples actually drawn so the
    # shape can never disagree with the emitted row indices
    x_shape = np.asarray((num_samples, num_cids + 2 * num_acts), dtype=int)

    # create block for cid
    cid_repeat = np.asarray([caseid]).repeat(num_samples)
    cid_datalist, cid_row_inds, cid_col_inds, _ = \
        rutils.single_to_fm_format(cid_repeat, cid_list)

    # create block for taken acts
    taken_repeat = np.asarray([subpartialcase]).repeat(num_samples, axis=0)
    taken_datalist, taken_row_inds, taken_col_inds, _ = \
        rutils.multiple_to_fm_format(taken_repeat, activity_list)

    # create block for samples
    next_datalist, next_row_inds, next_col_inds, _ = \
        rutils.single_to_fm_format(samples, activity_list)

    # shift taken columns by |cid_list|
    taken_col_inds = taken_col_inds + num_cids
    # shift next columns by |cid_list| + |activity_list|
    next_col_inds = next_col_inds + num_cids + num_acts

    x_datalist = np.concatenate((cid_datalist, taken_datalist,
                                 next_datalist))
    # builtin int replaces the np.int alias removed in NumPy 1.24
    x_row_inds = np.concatenate((cid_row_inds, taken_row_inds,
                                 next_row_inds)).astype(int)
    x_col_inds = np.concatenate((cid_col_inds, taken_col_inds,
                                 next_col_inds)).astype(int)

    # 1 for the true next activity, 0 for the negative samples
    y_datalist = (samples == to_predict).astype(int)
    pred_id_list = np.ones(len(y_datalist)) * pred_id

    return x_datalist, x_row_inds, x_col_inds, x_shape, \
        y_datalist, pred_id_list
def variant_to_fm_format(steps, step_list, activity_list,
                         rev_step_mapping, minpartialsz=2,
                         negative_samples=3, seed=123,
                         normalize=True, pred_id=0):
    '''
    Method to make matrix representation of step case.

    For every prefix of ``steps`` of length ``minpartialsz`` up to
    ``len(steps) - 1`` one prediction is generated: the activity of the step
    following the prefix is the positive sample and other activities are the
    negative samples. Matrix will contain the following features:
        - steps taken                 (|step_list| columns)
        - executed activities         (|activity_list| columns)
        - activity to be predicted    (|activity_list| columns)

    Parameters:
    - steps: array of step identifiers of one variant (``.shape`` is used).
    - step_list: all step identifiers.
    - activity_list: all activities.
    - rev_step_mapping: mapping from a step to a sequence of activity labels;
      the last label, cut at the first '+', is the activity of the step.
    - minpartialsz: length of the shortest prefix to predict from.
    - negative_samples: negatives per prediction, drawn without replacement
      and clamped to the number of available negatives; a value below 0
      means "use every other activity".
    - seed: the global numpy random state is re-seeded with this value
      before every draw.
    - normalize: forwarded to ``rutils.multiple_to_fm_format``.
    - pred_id: identifier of the first prediction; incremented by one for
      each following prediction.

    Returns a 6-tuple ``(x_datalist, x_row_inds, x_col_inds, x_shape,
    y_datalist, pred_id_list)``. When ``steps`` is not longer than
    ``minpartialsz`` the arrays are empty and ``x_shape`` is ``[0., 0.]``.
    '''
    # base step, steps is shorter or equal to minimum partial size,
    # return it since no prediction to be made
    if steps.shape[0] <= minpartialsz:
        return (np.asarray([]), np.asarray([]), np.asarray([]),
                np.zeros(shape=(2,)), np.asarray([]), np.asarray([]))

    def get_act(label):
        # an activity label may be bound with other things after a '+'
        return label.split('+')[0]

    num_steps = len(step_list)
    num_acts = len(activity_list)
    num_of_cols = num_steps + 2 * num_acts

    # per-prediction parts, concatenated once after the loop
    data_parts = []
    row_parts = []
    col_parts = []
    label_parts = []
    pred_parts = []
    total_rows = 0

    for ind in range(minpartialsz, steps.shape[0]):
        partialcase = steps[:ind]
        # get the actual next activity and predict for it
        gt_next_act = get_act(rev_step_mapping[steps[ind]][-1])

        negatives = [act for act in activity_list if act != gt_next_act]
        if negative_samples > -1:
            np.random.seed(seed=seed)
            # cannot draw more negatives than exist without replacement
            sample_sz = min(negative_samples, len(negatives))
            if sample_sz > 0:
                picked_inds = np.random.choice(np.arange(len(negatives)),
                                               size=sample_sz,
                                               replace=False)
            else:
                picked_inds = []
            negatives = [negatives[picked] for picked in picked_inds]
        # the true activity always goes last
        samples = np.asarray(negatives + [gt_next_act])
        num_samples = len(samples)

        # create the taken steps part
        taken_repeat = np.asarray([partialcase for _ in range(num_samples)])
        taken_datalist, taken_row_inds, taken_col_inds, taken_shape = \
            rutils.multiple_to_fm_format(taken_repeat, step_list, normalize)

        # create the taken activities part
        t_acts = np.asarray([get_act(rev_step_mapping[step][-1])
                             for step in partialcase])
        # need to add the first activity of the first step
        if ARTIFICIAL_START not in t_acts:
            t_acts = np.append([ARTIFICIAL_START], t_acts)
        not_in = [act for act in t_acts if act not in activity_list]
        assert len(not_in) == 0, \
            't_acts not in activity list: {} with {} items.' \
            .format(str(not_in), len(not_in))
        t_acts_repeat = np.asarray([t_acts for _ in range(num_samples)])
        t_acts_datalist, t_acts_row_inds, t_acts_col_inds, t_acts_shape = \
            rutils.multiple_to_fm_format(t_acts_repeat, activity_list,
                                         normalize)

        # samples
        next_datalist, next_row_inds, next_col_inds, next_shape = \
            rutils.single_to_fm_format(samples, activity_list)

        # check dimensions before combining the three parts
        assert taken_shape[0] == next_shape[0] == t_acts_shape[0], \
            'taken shape: {}, next shape: {}, t_acts shape: {}' \
            .format(taken_shape, next_shape, t_acts_shape)
        assert taken_shape[1] == num_steps, \
            'taken shape: {}, step list shape: {}' \
            .format(taken_shape, num_steps)
        assert next_shape[1] == num_acts, \
            'next shape: {}, activity list shape: {}' \
            .format(next_shape, num_acts)
        assert t_acts_shape[1] == num_acts, \
            'taken acts shape: {}, activity list shape: {}' \
            .format(t_acts_shape, num_acts)

        # shift taken activities columns by |step_list|
        t_acts_col_inds = t_acts_col_inds + num_steps
        # shift next columns by |step_list| + |activity_list|
        next_col_inds = next_col_inds + num_steps + num_acts

        x_row_inds_i = np.concatenate((taken_row_inds, t_acts_row_inds,
                                       next_row_inds))
        if total_rows > 0:
            # shift rows by the number of rows of the earlier predictions
            x_row_inds_i = x_row_inds_i + total_rows

        # create the target y datalist, putting 1 for the next activity
        # and putting 0 for the negative samples
        # (builtin int replaces the np.int alias removed in NumPy 1.24)
        y_datalist_i = (samples == gt_next_act).astype(int)
        assert int(y_datalist_i.sum()) == 1, \
            'y_datalist_i: {}'.format(y_datalist_i)

        data_parts.append(np.concatenate((taken_datalist, t_acts_datalist,
                                          next_datalist)))
        row_parts.append(x_row_inds_i)
        col_parts.append(np.concatenate((taken_col_inds, t_acts_col_inds,
                                         next_col_inds)))
        label_parts.append(y_datalist_i)
        # pred_id_list, should all be the same prediction number
        pred_parts.append(np.ones(len(y_datalist_i)) * pred_id)

        total_rows += num_samples
        pred_id += 1

    x_shape = np.asarray((total_rows, num_of_cols))
    return np.concatenate(data_parts), np.concatenate(row_parts), \
        np.concatenate(col_parts), x_shape, \
        np.concatenate(label_parts), np.concatenate(pred_parts)
def step_case_to_fm_format(caseid, steps, caseid_list, step_list,
                           next_step_mapping, minpartialsz=2,
                           negative_samples=3, seed=123,
                           normalize=True, pred_id=0):
    '''
    Method to make matrix representation of the next-step predictions of
    one case.

    For every prefix of ``steps`` of length ``minpartialsz`` up to
    ``len(steps) - 1`` one prediction is generated: the step following the
    prefix is the positive sample, and other steps reachable from the last
    step of the prefix (per ``next_step_mapping``) are the negative samples.
    Matrix will contain the following features:
        - case id               (|caseid_list| columns)
        - steps taken           (|step_list| columns)
        - step to be predicted  (|step_list| columns)

    Parameters:
    - caseid: identifier of the case; encoded against ``caseid_list``.
    - steps: array of step identifiers of the case (``.shape`` is used).
    - caseid_list: all case identifiers.
    - step_list: all step identifiers.
    - next_step_mapping: mapping from a step to the non-empty collection of
      steps that may follow it.
    - minpartialsz: length of the shortest prefix to predict from.
    - negative_samples: negatives per prediction, drawn without replacement
      and clamped to the number of available negatives; a value below 0
      means "use every other possible next step".
    - seed: the global numpy random state is re-seeded with this value
      before every draw.
    - normalize: forwarded to ``rutils.multiple_to_fm_format``.
    - pred_id: identifier of the first prediction; incremented by one for
      each following prediction.

    Returns a 6-tuple ``(x_datalist, x_row_inds, x_col_inds, x_shape,
    y_datalist, pred_id_list)``. When ``steps`` is not longer than
    ``minpartialsz`` the arrays are empty and ``x_shape`` is ``[0., 0.]``.
    '''
    # base step, steps is shorter or equal to minimum partial size,
    # return it since no prediction to be made
    if steps.shape[0] <= minpartialsz:
        return (np.asarray([]), np.asarray([]), np.asarray([]),
                np.zeros(shape=(2,)), np.asarray([]), np.asarray([]))

    num_cids = len(caseid_list)
    num_steps = len(step_list)
    num_of_cols = num_cids + 2 * num_steps

    # per-prediction parts, concatenated once after the loop
    data_parts = []
    row_parts = []
    col_parts = []
    label_parts = []
    pred_parts = []
    total_rows = 0

    for ind in range(minpartialsz, steps.shape[0]):
        partialcase = steps[:ind]
        laststep = steps[ind - 1]
        possible_next_step_list = next_step_mapping[laststep]
        assert len(possible_next_step_list) > 0
        # actual next step
        gt_next_step = steps[ind]

        negatives = [step for step in possible_next_step_list
                     if step != gt_next_step]
        if negative_samples > -1:
            np.random.seed(seed=seed)
            # cannot draw more negatives than exist without replacement
            sample_sz = min(negative_samples, len(negatives))
            if sample_sz > 0:
                picked_inds = np.random.choice(np.arange(len(negatives)),
                                               size=sample_sz,
                                               replace=False)
            else:
                picked_inds = []
            negatives = [negatives[picked] for picked in picked_inds]
        # the true next step always goes last
        samples = np.asarray(negatives + [gt_next_step])
        num_samples = len(samples)

        # repeat caseids for |samples| times
        cid_repeat = np.asarray([caseid]).repeat(num_samples)
        cid_datalist, cid_row_inds, cid_col_inds, cid_shape = \
            rutils.single_to_fm_format(cid_repeat, caseid_list)

        # create the taken steps part
        taken_repeat = np.asarray([partialcase for _ in range(num_samples)])
        taken_datalist, taken_row_inds, taken_col_inds, taken_shape = \
            rutils.multiple_to_fm_format(taken_repeat, step_list, normalize)

        # samples
        next_datalist, next_row_inds, next_col_inds, next_shape = \
            rutils.single_to_fm_format(samples, step_list)

        # check dimensions before combining the three parts
        assert cid_shape[0] == taken_shape[0] == next_shape[0], \
            'cid shape: {}, taken shape: {}, next shape: {}' \
            .format(cid_shape, taken_shape, next_shape)
        assert cid_shape[1] == num_cids, \
            'cid shape: {}, caseid list shape: {}' \
            .format(cid_shape, num_cids)
        assert taken_shape[1] == num_steps, \
            'taken shape: {}, step list shape: {}' \
            .format(taken_shape, num_steps)
        assert next_shape[1] == num_steps, \
            'next shape: {}, step list shape: {}' \
            .format(next_shape, num_steps)

        # shift taken columns by |caseid_list|
        taken_col_inds = taken_col_inds + num_cids
        # shift next columns by |caseid_list| + |step_list|
        next_col_inds = next_col_inds + num_cids + num_steps

        x_row_inds_i = np.concatenate((cid_row_inds, taken_row_inds,
                                       next_row_inds))
        if total_rows > 0:
            # shift rows by the number of rows of the earlier predictions
            x_row_inds_i = x_row_inds_i + total_rows

        # create the target y datalist, putting 1 for the next step
        # and putting 0 for the negative samples
        # (builtin int replaces the np.int alias removed in NumPy 1.24)
        y_datalist_i = (samples == gt_next_step).astype(int)

        data_parts.append(np.concatenate((cid_datalist, taken_datalist,
                                          next_datalist)))
        row_parts.append(x_row_inds_i)
        col_parts.append(np.concatenate((cid_col_inds, taken_col_inds,
                                         next_col_inds)))
        label_parts.append(y_datalist_i)
        # pred_id_list, should all be the same prediction number
        pred_parts.append(np.ones(len(y_datalist_i)) * pred_id)

        total_rows += num_samples
        # increment pred id
        pred_id += 1

    x_shape = np.asarray((total_rows, num_of_cols))
    return np.concatenate(data_parts), np.concatenate(row_parts), \
        np.concatenate(col_parts), x_shape, \
        np.concatenate(label_parts), np.concatenate(pred_parts)