def __init__(self, vec_name, data_path=DATA_PATH, out_path=DATA_PATH):
     """
     Load a vecframe from file and run the vecframe format check on it.

     :param vec_name: name of the vecframe to load
     :param data_path: optional, location handed to load_frame; defaults to DATA_PATH
     :param out_path: optional, kept as self.out_path; defaults to DATA_PATH
     """
     super().__init__()
     # Keep the constructor arguments on the instance before loading anything.
     self.vec_name, self.data_path, self.out_path = vec_name, data_path, out_path

     # check_if_vecframe runs on the loaded frame before it is stored.
     loaded = load_frame(vec_name, data_path)
     check_if_vecframe(loaded)
     self.vecframe = loaded
    def dump_vecframe(self, vf, appendix=None, in_csv=False):
        """
        Check a vectorframe and dump it under a derived file name.

        :param vf: frame to dump; it is passed through check_if_vecframe first
        :param appendix: optional suffix; when given, the file name is
            ``self.vec_name + "_" + appendix``, otherwise ``self.out_name`` is used
        :param in_csv: forwarded unchanged to dump_frame
        :return: name under which the frame was dumped
        """
        check_if_vecframe(vf)
        # Identity test: None is a singleton, and `==` can be redefined by the
        # argument's own __eq__.
        # NOTE(review): out_name is not assigned in any code visible here --
        # confirm the class defines it before relying on the default branch.
        outname = self.out_name if appendix is None else self.vec_name + "_" + appendix

        dump_frame(vf, outname, self.out_path, in_csv=in_csv)
        return outname
Example #3
0
    def __init__(self, vf_fname, time_dim=0, in_datapath=DATA_PATH):
        """
        Load the frame holding the compressed embeddings and keep it as self.vecframe.

        :param vf_fname: name of the frame to load
        :param time_dim: optional; when falsy (default 0) the frame is read with
            load_frame and passed through check_if_vecframe, otherwise it is read
            with load_frame_as_3d_nparray and stored without that check
        :param in_datapath: optional, location handed to the loader; defaults to DATA_PATH
        """
        super().__init__()
        self.vf_fname, self.in_datapath = vf_fname, in_datapath

        # A truthy time_dim selects the 3d-array loader; nothing else to do then.
        if time_dim:
            self.vecframe = load_frame_as_3d_nparray(vf_fname, data_path=in_datapath)
            return

        flat_frame = load_frame(vf_fname, in_datapath)
        check_if_vecframe(flat_frame)
        self.vecframe = flat_frame
def make_weekly_vecframe(step_name, vec_name='{}_week', data_path=DATA_PATH):
    """
    Transform a stepframe into a vecframe without splitting the data.

    The stepframe columns from label '0' onward are transposed, so each of
    those columns becomes one row of the vecframe. 'desc' will always be 0.

    :param step_name: name of the stepframe to load
    :param vec_name: name under which the vecframe will be saved; it is
        formatted with step_name
    :param data_path: optional, path to data folder, used for loading and saving
    :return: None; the vecframe is written with dump_frame
    """
    # Read from the caller-supplied folder (previously the module default was
    # always used here, so a custom data_path only affected the dump).
    stepframe = load_frame(step_name, data_path)
    vecframe = stepframe.loc[:, '0':].transpose()
    vecframe.columns = [str(col) for col in vecframe.columns]
    # After the transpose the index holds the former column labels; they are
    # converted to int and stored as 'user'.
    vecframe['user'] = vecframe.index
    vecframe['user'] = vecframe['user'].apply(int)
    vecframe['desc'] = [0] * vecframe.shape[0]
    # 'user' and 'desc' were appended last; move them to the front.
    cols = list(vecframe.columns)
    vecframe = vecframe[cols[-2:] + cols[:-2]]
    check_if_vecframe(vecframe)
    dump_frame(vecframe, vec_name.format(step_name), data_path)
def daySplitter(step_name, data_path=DATA_PATH):
    """
    Split a stepframe into days and save the result as a vecframe.

    For every distinct value in the 'day' column, the rows of that day are
    restricted to column positions 4..1002, transposed, and tagged with the
    day as 'desc'. The per-day frames are concatenated and dumped under the
    name '<step_name>_dsp'.

    :param step_name: name of the stepframe to load
    :param data_path: optional, path to data folder, used for loading and saving
    :return: None; the vecframe is written with dump_frame
    """

    stepframe = load_frame(step_name, data_path)
    check_if_stepframe(stepframe)
    # The number of rows of day 0 fixes the vector length; the column
    # assignment below fails if another day has a different row count.
    vec_len = stepframe.loc[stepframe.day == 0].shape[0]
    vec_cols = list(range(vec_len))
    columns = ['user', 'desc'] + vec_cols
    vfs = []
    for day in stepframe.day.unique():
        # NOTE(review): presumably 4 leading non-user columns followed by up
        # to 999 per-user columns -- confirm against the stepframe layout.
        vf = stepframe[stepframe.day == day].iloc[:,
                                                  4:999 + 4].T.astype('int32')
        vf.columns = vec_cols
        # Builtin int replaces pd.np.int: both the pandas.np shim and the
        # numpy.int alias (which was just builtin int) have been removed.
        vf['user'] = vf.index.to_numpy(dtype=int)
        vf['desc'] = day
        vfs.append(vf)
    vecframe = pd.concat(vfs, sort=False, ignore_index=True)
    vecframe = vecframe[columns]
    vecframe.columns = vecframe.columns.astype(str)
    check_if_vecframe(vecframe)
    # Write next to where the stepframe was read from, matching how
    # make_weekly_vecframe passes data_path to dump_frame.
    dump_frame(vecframe, '{}_dsp'.format(step_name), data_path)
        [seq_len, 1, 1])
    out, emb = model(row_tens)

    emb = torch.squeeze(emb)

    embArr.append(head + (emb.data.tolist()))
    outArr.append(head + (out.data.tolist()))

# Frame layout: 'user', 'desc', then one string-labelled column for each
# remaining entry of a row (the width is taken from the first row).
emb_width = len(embArr[0]) - 2
emb_df = pd.DataFrame(
    data=embArr,
    columns=['user', 'desc'] + list(map(str, range(emb_width))))
out_width = len(outArr[0]) - 2
out_df = pd.DataFrame(
    data=outArr,
    columns=['user', 'desc'] + list(map(str, range(out_width))))

# Show shapes and the first rows of both frames before writing anything.
print(out_df.shape, emb_df.shape)

print(out_df.head(3))
print(emb_df.head(3))

check_if_vecframe(out_df)
check_if_vecframe(emb_df)

# Both file names end in the same "<epochs>_<batchsize>" tag.
run_tag = str(epochs) + "_" + str(batchsize)
outname = in_dsp_fname + "_LSTM_out_" + run_tag
embname = in_dsp_fname + "_LSTM_emb_" + run_tag

dump_frame(frame=out_df, name=outname, in_csv=True)
dump_frame(frame=emb_df, name=embname, in_csv=True)

print("written emb and out to disk in ", outname, embname)