Exemple #1
0
    def fill_results_for_id(self, _id, join_dict):
        """Fill ``join_dict`` entries with dot-product values for user pairs.

        For every key ``(u1, u2)`` in ``join_dict``, if both users appear in
        the pickled user index for ``_id``, store the coincidence value from
        the pickled dot-product matrix under ``join_dict[(u1, u2)][_id]``.

        Parameters
        ----------
        _id : str
            Identifier used to locate the ``<_id>_files`` directory.
        join_dict : dict
            Maps ``(user1, user2)`` tuples to per-id result dicts; mutated
            in place and also returned.
        """
        tic = time.time()
        # Paths to the pickled artifacts produced by earlier pipeline stages.
        user_ind_p = _id + "_files/clean_user_ind.pkl"
        sparse_matrix_dot_p = _id + "_files/sparse_matrix_dot_a.pkl"
        # Unpickling objects
        user_ind = unpickle_object(user_ind_p)
        sparse_matrix_dot = unpickle_object(sparse_matrix_dot_p)

        # Statistics retrieval
        total_pairs = len(join_dict)
        elements_written = 0
        pairs_processed = 0  # avoids NameError below when join_dict is empty
        for i, k in enumerate(join_dict):
            pairs_processed = i
            if i % 1000 == 0:
                self.pprint("[%s] Joining Results" % (_id),
                            "[%d Pairs Processed]" % (i),
                            "[%d Elements Written]" % (elements_written),
                            "[%0.3f Percentage]" % ((i / total_pairs) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            u1, u2 = k  # Extracting users
            if u1 in user_ind and u2 in user_ind:
                elements_written += 1
                join_dict[k][_id] = sparse_matrix_dot[user_ind[u1],
                                                      user_ind[u2]]

        # Guard against division by zero for an empty join_dict.
        pct = (pairs_processed / total_pairs) * 100 if total_pairs else 100.0
        self.pprint("[END] [%s] Joining Results" % (_id),
                    "[%d Pairs Processed]" % (pairs_processed),
                    "[%d Elements Written]" % (elements_written),
                    "[%0.3f Percentage]" % (pct),
                    get_ram(), get_elapsed_time(tic))
        return join_dict
Exemple #2
0
def do_extract_trigrams_mp(members):
    """Count trigram occurrences over ``members`` with a process pool.

    Processes ``members`` in batches of 10000, mapping
    ``get_trigrams_from_member`` across 12 workers, accumulates occurrence
    counts in a dict and writes the final counts to
    ``trigram_files/trigrams.csv``.

    Returns the dict mapping trigram -> usage count.
    """
    dict_trigrams = dict()
    tic = time.time()
    batch_size = 10000
    dirname = 'trigram_files/'
    create_dir(dirname)
    for i in range(0, len(members), batch_size):
        # Context manager guarantees the pool is torn down even if a worker
        # raises (the original only called close(), and only on success).
        with mp.Pool(12) as p:
            list_trigrams = p.map(get_trigrams_from_member,
                                  members[i:i + batch_size])
        print("Trigrams", "[%d Users Processed]" % (i),
              "[%d Trigrams]" % (len(dict_trigrams)),
              "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
              get_elapsed_time(tic))

        # Merge this batch's trigram lists into the global counts.
        for l in list_trigrams:
            for elem in l:
                dict_trigrams[elem] = dict_trigrams.get(elem, 0) + 1
    print("Saving data...")
    gen_csv_from_tuples(dirname + "trigrams.csv", ["trigram", "usage"],
                        [(k, v) for k, v in dict_trigrams.items()])
    return dict_trigrams
Exemple #3
0
def do_extract_trigrams(members):
    """Count trigrams across all ``members``, checkpointing counts to CSV.

    Iterates users one at a time, accumulating a trigram -> occurrence-count
    dict.  Progress is printed every 1000 users, partial counts are flushed
    to ``trigram_files/trigrams.csv`` every 10000 users, and the final
    counts are written once more at the end.

    Returns the dict mapping trigram -> usage count.
    """
    trigram_counts = dict()
    started = time.time()
    out_dir = 'trigram_files/'
    create_dir(out_dir)
    total = len(members)
    for idx, member in enumerate(members):
        if idx % 1000 == 0:
            print("[%d Users Processed]" % (idx),
                  "[%d Trigrams]" % (len(trigram_counts)),
                  "[%0.3f Percentage]" % ((idx / total) * 100), get_ram(),
                  get_elapsed_time(started))
            # Periodic checkpoint so a crash does not lose all the counts.
            if idx % 10000 == 0:
                print("Saving data...")
                gen_csv_from_tuples(out_dir + "trigrams.csv",
                                    ["trigram", "usage"],
                                    [(k, v) for k, v in trigram_counts.items()])

        for tri in get_trigrams_from_member(member):
            trigram_counts[tri] = trigram_counts.get(tri, 0) + 1
    print("Saving data...")
    gen_csv_from_tuples(out_dir + "trigrams.csv", ["trigram", "usage"],
                        [(k, v) for k, v in trigram_counts.items()])
    return trigram_counts
Exemple #4
0
def do_extract_users_trigrams_mp(members):
    """Map users to their trigrams in parallel, writing results in batches.

    Processes ``members`` in batches of 20000, mapping
    ``do_extract_user_trigrams`` across 16 workers.  Each batch is written
    to ``trigram_files/user_to_trigram.csv_<offset>`` and the partial files
    are merged at the end via ``join_all_results``.

    Returns the trigram dict (currently left empty; kept for interface
    parity with callers).
    """
    dict_trigrams = dict()
    tic = time.time()
    batch_size = 20000
    dirname = 'trigram_files/'
    create_dir(dirname)

    print(len(dict_trigrams))
    # Batch the work so worker results never all sit in memory at once;
    # each batch is flushed to its own partial CSV.
    for i in range(0, len(members), batch_size):
        print("Users Trigrams", "[%d Users Processed]" % (i),
              "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
              get_elapsed_time(tic))
        batch = members[i:i + batch_size]
        # Context manager tears the pool down even if a worker raises
        # (the original only called close() on the happy path).
        with mp.Pool(16) as p:
            users_trigrams = p.map(do_extract_user_trigrams, batch)
        # Only keep result tuples that carry more than the author id.
        gen_csv_from_tuples(dirname + "user_to_trigram.csv_%d" % (i),
                            ["IdAuthor", "IdTrigrams"],
                            [x for x in users_trigrams if len(x) > 1])

    join_all_results(dirname + "user_to_trigram.csv")
    return dict_trigrams
Exemple #5
0
    def get_information_from_matrix(self, user_ind, sparse_matrix_dot):
        """Extract, per user, the other user(s) with the maximal relation.

        For every row of the coincidence matrix, finds the maximum
        off-diagonal value and records one ``(user, other_user, value)``
        tuple per column attaining that maximum.  Results are sorted by
        value descending, pickled to ``<dir>/results.pkl`` and written to
        ``<dir>/results.csv``.

        Parameters
        ----------
        user_ind : dict
            Maps user name -> row/column index of the matrix.
        sparse_matrix_dot : scipy sparse matrix
            Square user-by-user coincidence matrix.

        Returns the sorted list of ``(user1, user2, value)`` tuples.
        """
        tic = time.time()
        inv_user_ind = {v: k for k, v in user_ind.items()}

        lst_res = []
        tx = sparse_matrix_dot.shape[0]
        print(sparse_matrix_dot.shape)
        uind = 0  # keeps the final status line valid when the matrix is empty
        for uind in range(tx):
            if uind % 100 == 0:
                self.pprint("Info Extraction",
                            "[%d Users Processed]" % (uind),
                            "[%d List Length]" % (len(lst_res)),
                            "[%0.3f Percentage]" % ((uind / tx) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            row = np.array(sparse_matrix_dot[uind].toarray())
            row = row.flatten()
            row[uind] = 0  # We do not consider the comparison with itself
            rmax = row.max()
            if rmax > 0:
                # Number of columns attaining the maximum value.
                n = (row == rmax).sum()
                # argpartition places the n largest entries in the last n
                # positions; cheaper than a full argsort of the row.
                max_uinds = np.argpartition(row, -n)[-n:]
                for i in max_uinds:
                    lst_res.append((inv_user_ind[uind], inv_user_ind[i], rmax))

        lst_res = sorted(lst_res, key=lambda x: x[2], reverse=True)
        pickle_object(lst_res, self.dir + "results.pkl")
        # Guard against division by zero when the matrix has no rows.
        pct = (uind / tx) * 100 if tx else 100.0
        self.pprint("[END] Info Extraction", "[%d Users Processed]" % (uind),
                    "[%d List Length]" % (len(lst_res)),
                    "[%0.3f Percentage]" % (pct), get_ram(),
                    get_elapsed_time(tic))

        gen_csv_from_tuples(self.dir + "results.csv",
                            ["User1", "User2", "Relation Value"], lst_res)

        return lst_res
Exemple #6
0
 def compute_matrix_mult(self, sparse_matrix):
     """Compute the user-by-user coincidence matrix ``M @ M.T``.

     Keeps only the upper triangle of the (symmetric) product, drops
     explicit zeros, pickles the result to
     ``<dir>/sparse_matrix_dot_a.pkl`` and returns it as CSR.
     """
     started = time.time()
     result_path = self.dir + 'sparse_matrix_dot_a.pkl'
     print(sparse_matrix.shape)
     self.pprint("Executing coincidence computation over matrix",
                 get_ram(),
                 get_elapsed_time(started),
                 end='\r')
     dot_product = sparse_matrix.dot(sparse_matrix.T)
     # The product is symmetric, so the upper triangle carries all the info.
     dot_product = sparse.triu(dot_product, format='csr')
     print(dot_product.shape)
     dot_product.eliminate_zeros()
     pickle_object(dot_product, result_path)
     self.pprint("[END] Executing coincidence computation over matrix",
                 get_ram(), get_elapsed_time(started))
     print(dot_product.shape)
     return dot_product
Exemple #7
0
    def gen_sparse_matrix(self, dictio_of_users, dictio_of_usage, num_values):
        """Build the user-by-value CSR matrix of usage counts.

        Row indices come from the keys of ``dictio_of_users``, column
        indices from its value lists, and cell data from the parallel lists
        in ``dictio_of_usage``.  The matrix is pickled to
        ``<dir>/sparse_matrix.pkl``; when ``self.backup`` is set and the
        pickle already exists it is loaded instead of rebuilt.

        Parameters
        ----------
        dictio_of_users : dict
            Maps user index -> list of value indices used by that user.
        dictio_of_usage : dict
            Maps user index -> list of usage counts, parallel to
            ``dictio_of_users``.
        num_values : int
            Number of columns of the resulting matrix.
        """
        tic = time.time()
        sparse_matrix_p = self.dir + 'sparse_matrix.pkl'

        # Adding files to list for cleanup
        self.cleanup_list.append(sparse_matrix_p)

        if self.backup and os.path.exists(sparse_matrix_p):
            self.pprint("Sparse Matrix already exist, unpickling.", end='\r')
            sparse_matrix = unpickle_object(sparse_matrix_p)
            self.pprint("[END] Sparse Matrix already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return sparse_matrix

        num_users = len(dictio_of_users)
        rows = []
        cols = []
        data = []
        ind = 0  # avoids NameError in the final status line for empty input
        for ind, (uind, values) in enumerate(dictio_of_users.items()):
            if ind % 1000 == 0:
                self.pprint("Sparse Matrix Generation",
                            "[%d Users Processed]" % (ind),
                            "[%0.3f Percentage]" % ((ind / num_users) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            usages = dictio_of_usage[uind]
            for value, usage in zip(values, usages):
                rows.append(uind)
                cols.append(value)
                data.append(usage)

        sparse_matrix = csr_matrix((data, (rows, cols)),
                                   shape=(num_users, num_values),
                                   dtype=self.dtype)
        # Guard against division by zero when no users were supplied.
        pct = (ind / num_users) * 100 if num_users else 100.0
        self.pprint("[END] Sparse Matrix Generation",
                    "[%d Users Processed]" % (ind),
                    "[%0.3f Percentage]" % (pct),
                    get_ram(), get_elapsed_time(tic))
        pickle_object(sparse_matrix, sparse_matrix_p)
        return sparse_matrix
Exemple #8
0
    def calculate_tfidf(self, sparse_matrix):
        """Turn the raw usage matrix into a TF-IDF weighted CSR matrix.

        TF is each cell divided by its row sum (the user's total count);
        IDF is ``log(num_users / document_frequency)`` per column.
        Explicit zeros are pruned and the result is returned in CSR format.
        """
        started = time.time()
        self.pprint("Computing TF-IDF",
                    get_ram(),
                    get_elapsed_time(started),
                    end='\r')
        num_users, num_values = sparse_matrix.shape

        # Term frequency: usage / total usage of the user (row-wise scale).
        tf = sparse_matrix.multiply(1 / sparse_matrix.sum(axis=1))
        # Inverse document frequency: total users / users using the value.
        idf = np.log(num_users / (sparse_matrix != 0).sum(axis=0))
        tfidf = tf.multiply(idf)
        print(tfidf.shape)
        tfidf.eliminate_zeros()
        print(tfidf.shape)
        tfidf = tfidf.tocsr()
        self.pprint("[END] Computing TF-IDF", get_ram(),
                    get_elapsed_time(started))
        return tfidf
Exemple #9
0
def extract_chars_per_user(members):
    """Compute per-user character metrics in parallel batches.

    Maps ``extract_metrics_for_user`` over ``members`` in batches of 24000
    with 16 workers, writes each batch to
    ``num_files/user_to_num.csv_<offset>`` and merges the partial files at
    the end via ``join_all_results``.
    """
    dirname = 'num_files/'
    filename = 'user_to_num.csv'
    tic = time.time()
    batch_size = 24000
    create_dir(dirname)
    for i in range(0, len(members), batch_size):
        # Context manager tears the pool down even if a worker raises
        # (the original only called close() on the happy path).
        with mp.Pool(16) as p:
            lst_pr = p.map(extract_metrics_for_user, members[i:i + batch_size])
        print("Chars per User", "[%d Users Processed]" % (i),
              "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
              get_elapsed_time(tic))
        # Every batch result is stored (no filtering is applied here).
        gen_csv_from_tuples(dirname + filename + "_%d" % (i),
                            ["IdAuthor", "IdTrigrams"], lst_pr)

    join_all_results(dirname + filename)
Exemple #10
0
def do_extract_timestamps(members):
    """Dump user timestamps to partial CSV files and merge them.

    Walks ``members`` in chunks of 100000, writing each chunk via
    ``extract_timestamps`` to ``timestamp_files/user_to_timestamp.csv_<n>``,
    then joins all the partial files into a single CSV.
    """
    num_users = len(members)
    started = time.time()
    print("Extracted %d users" % (num_users))
    out_dir = 'timestamp_files/'
    create_dir(out_dir)
    chunk = 100000
    for batch_no, offset in enumerate(range(0, num_users, chunk)):
        print("User Timestamps", "[%d Users Processed]" % (offset),
              "[%0.3f Percentage]" % ((offset / len(members)) * 100),
              get_ram(), get_elapsed_time(started))
        extract_timestamps(members[offset:offset + chunk],
                           out_dir + "user_to_timestamp.csv_%d" % (batch_no))
        print(batch_no + 1, offset + chunk, time.time() - started)
    join_all_results(out_dir + "user_to_timestamp.csv")
Exemple #11
0
    def compute(self):
        """Run the full pipeline: structurize, TF-IDF, dot product, report.

        Orchestrates the other methods of this class, logging progress via
        ``self.pprint``.  Intermediate structures are dropped (rebound to
        None) as soon as they are no longer needed to keep memory down.
        """
        tic = time.time()
        self.pprint("[>>>] Starting with %s [<<<]" % (self.identifier))
        #if self.backup and os.path.exists(self.dir + "results.pkl"):
        #return

        sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.structurize(
        )
        # Free the structures that are not used past this point.
        (value_ind, dictio_of_users, dictio_of_values) = tuple([None] * 3)
        sparse_matrix = self.calculate_tfidf(sparse_matrix)

        sparse_matrix_dot = self.compute_matrix_mult(sparse_matrix)
        #return sparse_matrix_dot
        sparse_matrix = None  # TF-IDF matrix no longer needed after the dot
        self.get_information_from_matrix(user_ind, sparse_matrix_dot)

        self.pprint("[>>>] Ended with %s [<<<]" % (self.identifier), get_ram(),
                    get_elapsed_time(tic))
        self.pprint(self.log)
Exemple #12
0
    def gen_data(self):
        """Parse the input CSV into index and adjacency dictionaries.

        Reads ``self.data`` (a CSV whose first column is the user and whose
        remaining columns are value/usage tokens split by ``self.separate``)
        and builds:

        - ``user_ind``:  user name -> user index (np.uint32)
        - ``value_ind``: value -> value index (np.uint32)
        - ``dictio_of_users``:  user index -> list of value indices
        - ``dictio_of_values``: value index -> list of user indices
        - ``dictio_of_usage``:  user index -> list of usage counts

        When ``self.backup`` is set and all five pickles exist, they are
        loaded from disk instead (``dictio_of_usage`` is then returned as
        None — see the NOTE below).

        Returns the five structures as a tuple, in the order above.
        """
        tic = time.time()
        #Create the path for storing the dictionaries
        user_ind_p = self.dir + 'user_ind.pkl'
        value_ind_p = self.dir + 'value_ind.pkl'
        dictio_of_users_p = self.dir + 'dictio_of_users.pkl'
        dictio_of_values_p = self.dir + 'dictio_of_values.pkl'
        dictio_of_usage_p = self.dir + 'dictio_of_usage.pkl'

        #Adding files to list for cleanup
        self.cleanup_list.append(user_ind_p), self.cleanup_list.append(
            value_ind_p), self.cleanup_list.append(
                dictio_of_users_p), self.cleanup_list.append(
                    dictio_of_values_p), self.cleanup_list.append(
                        dictio_of_usage_p)

        if self.backup and os.path.exists(user_ind_p) and os.path.exists(
                value_ind_p
        ) and os.path.exists(dictio_of_users_p) and os.path.exists(
                dictio_of_values_p) and os.path.exists(dictio_of_usage_p):
            self.pprint("Data Structures already exist, unpickling.", end='\r')
            user_ind = unpickle_object(user_ind_p)
            value_ind = unpickle_object(value_ind_p)
            dictio_of_users = unpickle_object(dictio_of_users_p)
            dictio_of_values = unpickle_object(dictio_of_values_p)
            # TODO Remove comment
            #dictio_of_usage = unpickle_object(dictio_of_usage_p)
            # NOTE(review): the usage dict is deliberately not unpickled on
            # the backup path, so it comes back as None — confirm callers
            # tolerate this asymmetry with the freshly-built path.
            dictio_of_usage = None
            self.pprint("[END] Data Structures already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage

        # Skip the CSV header row.
        lst = read_csv_list(self.data)[1:]

        tic = time.time()
        user_ind = {}
        value_ind = {}
        dictio_of_users = {}
        dictio_of_values = {}
        dictio_of_usage = {}
        total = len(lst)
        max_val = np.uint32(0)  # next value index to be assigned
        for uind, i in enumerate(lst):
            if uind % 1000 == 0:
                self.pprint("Data Structures Generation",
                            "[%d Users Processed]" % (uind),
                            "[%0.3f Percentage]" % ((uind / total) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            uind = np.uint32(uind)
            user_ind[i[0]] = uind  # first column is the user name
            user = i[0]
            dictio_of_users[uind] = []
            dictio_of_usage[uind] = []
            # Remaining columns are "value/usage" tokens.
            for t in i[1:]:
                value, usage = self.separate(t)
                usage = np.uint32(usage)
                if value not in value_ind:
                    # First sighting of this value: assign the next index.
                    value_ind[value] = max_val
                    dictio_of_values[max_val] = []
                    max_val += 1
                vind = value_ind[value]
                dictio_of_values[vind].append(uind)
                dictio_of_users[uind].append(vind)
                dictio_of_usage[uind].append(usage)
        self.pprint("[END] Data Structures Generation",
                    "[%d Users Processed]" % (uind),
                    "[%0.3f Percentage]" % ((uind / total) * 100), get_ram(),
                    get_elapsed_time(tic))

        lst = None  # Freeing space from list, no longer needed

        #self.pprint("[0/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(user_ind, user_ind_p)
        #self.pprint("[1/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(value_ind, value_ind_p)
        #self.pprint("[2/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(dictio_of_users, dictio_of_users_p)
        #self.pprint("[3/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(dictio_of_values, dictio_of_values_p)
        #self.pprint("[4/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(dictio_of_usage, dictio_of_usage_p)
        #self.pprint("[END] [5/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage
Exemple #13
0
    def clean_matrix(self, sparse_matrix, user_ind, value_ind, dictio_of_users,
                     dictio_of_values):
        """Filter users and values out of the matrix and its index dicts.

        Pipeline: drop values used by only one user, intersect with the
        keep-lists returned by the optional ``self.user_removal`` /
        ``self.value_removal`` procedures, re-index everything through
        ``remove_user_value_set``, then prune rows/columns that became
        empty and re-index once more.  The cleaned structures are pickled
        under ``<dir>/clean_*.pkl``; when ``self.backup`` is set and all
        those pickles exist, they are loaded and returned directly.

        Returns the tuple ``(sparse_matrix, user_ind, value_ind,
        dictio_of_users, dictio_of_values)`` after cleaning.
        """
        tic = time.time()

        #inv_user_ind = {v: k for k, v in user_ind.items()}
        #inv_value_ind = {v: k for k, v in value_ind.items()}

        sparse_matrix_p = self.dir + 'clean_sparse_matrix.pkl'
        user_ind_p = self.dir + 'clean_user_ind.pkl'
        value_ind_p = self.dir + 'clean_value_ind.pkl'
        dictio_of_users_p = self.dir + 'clean_dictio_of_users.pkl'
        dictio_of_values_p = self.dir + 'clean_dictio_of_values.pkl'

        #Adding files to list for cleanup
        self.cleanup_list.append(sparse_matrix_p), self.cleanup_list.append(
            user_ind_p
        ), self.cleanup_list.append(value_ind_p), self.cleanup_list.append(
            dictio_of_users_p), self.cleanup_list.append(dictio_of_values_p)

        if self.backup and os.path.exists(sparse_matrix_p) and os.path.exists(
                user_ind_p) and os.path.exists(value_ind_p) and os.path.exists(
                    dictio_of_users_p) and os.path.exists(dictio_of_values_p):
            self.pprint("Clean data already exist, unpickling.", end='\r')
            user_ind = unpickle_object(user_ind_p)
            value_ind = unpickle_object(value_ind_p)
            dictio_of_users = unpickle_object(dictio_of_users_p)
            dictio_of_values = unpickle_object(dictio_of_values_p)
            sparse_matrix = unpickle_object(sparse_matrix_p)
            self.pprint("[END] Clean data already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values

        user_set = set(dictio_of_users.keys())
        # Keep only values used by more than one user.
        self.pprint("Taking values that appear once.",
                    get_ram(),
                    get_elapsed_time(tic),
                    end='\r')
        value_set = set([k for k, v in dictio_of_values.items() if len(v) > 1])
        # NOTE(review): the first %d here prints len(value_set), not a
        # distinct "remaining" count — confirm the intended log format.
        self.pprint(
            "[END] Taking values that appear once",
            "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
            (len(value_set), len(user_set), len(value_set)), get_ram(),
            get_elapsed_time(tic))

        # We execute all user removal procedures specified.
        # Each procedure returns the list of user indices to KEEP; the kept
        # sets are intersected across procedures.
        if not self.user_removal is None:
            for ind, procedure in enumerate(self.user_removal):
                self.pprint("Executing user removal procedure [%d] " % (ind),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
                user_list = procedure(user_ind, value_ind, dictio_of_users,
                                      dictio_of_values)
                user_set = user_set.intersection(set(user_list))
                self.pprint(
                    "[END] Executing user removal procedure [%d]" % (ind + 1),
                    "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                    (len(user_list), len(user_set), len(value_set)), get_ram(),
                    get_elapsed_time(tic))

        # We execute all value removal procedures specified by the user.
        # Same keep-and-intersect contract as the user procedures above.
        if not self.value_removal is None:
            for ind, procedure in enumerate(self.value_removal):
                self.pprint("Executing value removal procedure [%d]" % (ind),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
                value_list = procedure(user_ind, value_ind, dictio_of_users,
                                       dictio_of_values)
                value_set = value_set.intersection(set(value_list))
                self.pprint(
                    "[END] Executing value removal procedure [%d]" % (ind + 1),
                    "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                    (len(value_list), len(user_set), len(value_set)),
                    get_ram(), get_elapsed_time(tic))

        # First re-index: apply the keep-sets computed above.
        sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
            sparse_matrix, user_ind, value_ind, dictio_of_users,
            dictio_of_values, user_set, value_set)

        # Second pass: drop users/values whose adjacency lists became empty.
        self.pprint("Obtaining empty data", end='\r')
        user_set = set(dictio_of_users.keys())
        value_set = set(dictio_of_values.keys())
        user_set_rem = set([
            uind for uind, vinds in dictio_of_users.items() if len(vinds) == 0
        ])
        value_set_rem = set([
            vind for vind, uinds in dictio_of_values.items() if len(uinds) == 0
        ])
        user_set = user_set.difference(user_set_rem)
        value_set = value_set.difference(value_set_rem)
        self.pprint("[END] Obtaining empty data")
        sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
            sparse_matrix, user_ind, value_ind, dictio_of_users,
            dictio_of_values, user_set, value_set)

        # Persist the cleaned structures for the backup fast-path above.
        pickle_object(sparse_matrix, sparse_matrix_p)
        pickle_object(user_ind, user_ind_p)
        pickle_object(value_ind, value_ind_p)
        pickle_object(dictio_of_users, dictio_of_users_p)
        pickle_object(dictio_of_values, dictio_of_values_p)
        return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values
Exemple #14
0
    def remove_user_value_set(self, sparse_matrix, user_ind, value_ind,
                              dictio_of_users, dictio_of_values, user_set,
                              value_set):
        """Keep only the given users/values and re-index everything densely.

        Parameters
        ----------
        user_set, value_set : set
            Indices (under the CURRENT numbering) of the users/values to
            keep; everything else is removed.

        Slices the matrix down to the kept rows and columns (in sorted
        index order), builds old-index -> new-dense-index translation maps,
        and rewrites ``user_ind``, ``value_ind``, ``dictio_of_users`` and
        ``dictio_of_values`` in the new numbering.  The new indices are
        assigned in the iteration order of the surviving ``user_ind`` /
        ``value_ind`` entries, which matches the sorted slicing order only
        if those dicts were built in ascending index order — presumably
        they are; TODO confirm against ``gen_data``.

        Returns the re-indexed ``(sparse_matrix, user_ind, value_ind,
        dictio_of_users, dictio_of_values)``.
        """
        tic = time.time()
        # We remove the data from the matrix
        users, values = sparse_matrix.shape
        # Sorted so row/column slicing preserves relative order.
        rem_users = sorted(list(user_set))
        rem_values = sorted(list(value_set))
        self.pprint("[STATUS] [Original] Users: [%d] Values: [%d]" %
                    (users, values))
        sparse_matrix = sparse_matrix[rem_users, :]
        users, values = sparse_matrix.shape
        self.pprint(
            "[STATUS] [User Removal] Users: [%d] Values: [%d]" %
            (users, values), get_ram(), get_elapsed_time(tic))
        sparse_matrix = sparse_matrix[:, rem_values]
        users, values = sparse_matrix.shape
        self.pprint(
            "[STATUS] [Value Removal] Users: [%d] Values: [%d]" %
            (users, values), get_ram(), get_elapsed_time(tic))

        # Users to remove
        self.pprint("Generating new user data.",
                    get_ram(),
                    get_elapsed_time(tic),
                    end='\r')
        temp = {
            user: uind
            for user, uind in user_ind.items() if uind in user_set
        }  # Remove not needed users
        #print(len(temp))
        trans_user_dict = {
            uind: new_uind
            for new_uind, (_, uind) in enumerate(temp.items())
        }  # Translate from current index to new index
        #print(len(temp), len(trans_user_dict))
        user_ind = {
            user: trans_user_dict[uind]
            for user, uind in temp.items()
        }  # Update user_ind with new indexes
        self.pprint("[END] Generating new user data: %d" % (len(user_ind)),
                    get_ram(), get_elapsed_time(tic))
        self.pprint("Generating new value data.",
                    get_ram(),
                    get_elapsed_time(tic),
                    end='\r')
        temp = {
            value: vind
            for value, vind in value_ind.items() if vind in value_set
        }  # Remove not needed values
        trans_value_dict = {
            vind: new_vind
            for new_vind, (_, vind) in enumerate(temp.items())
        }  # Translate from current index to new index
        value_ind = {
            value: trans_value_dict[vind]
            for value, vind in temp.items()
        }  # Update the value_ind with new indexes
        self.pprint("[END] Generating new value data: %d" % (len(value_ind)),
                    get_ram(), get_elapsed_time(tic))

        # Rebuild both adjacency dicts under the new numbering, dropping
        # any references to removed users/values.
        dictio_of_users = {
            trans_user_dict[uind]:
            [trans_value_dict[vind] for vind in vinds if vind in value_set]
            for uind, vinds in dictio_of_users.items() if uind in user_set
        }
        dictio_of_values = {
            trans_value_dict[vind]:
            [trans_user_dict[uind] for uind in uinds if uind in user_set]
            for vind, uinds in dictio_of_values.items() if vind in value_set
        }

        return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values