Example #1
def gen_coincidences(do):
	directory = "multfs_users/"
	# Skip the CSV header row.
	lst_users = read_csv_list("multfs.csv")[1:]
	create_dir(directory)
	dictios_of_users = []
	value_inds = []
	user_inds = []

	# Only the first five pairs are analyzed.
	pairs = lst_users[:5]

	# Load the cleaned per-feature structures for every requested feature id.
	for _id in do:
		dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
		dictios_of_users.append(dictio_of_users)
		print(len(dictio_of_users))
		value_ind = unpickle_object(_id + "_files/clean_value_ind.pkl")
		value_inds.append(value_ind)
		user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
		user_inds.append(user_ind)

	for index, (u1, u2, _, _, _, _) in enumerate(pairs):
		print("Going for %d" % (index), u1, u2)
		uname1, _, _ = get_username(u1)
		uname2, _, _ = get_username(u2)
		directory2 = directory + "%s(%s)-%s(%s)/" % (u1, uname1, u2, uname2)
		create_dir(directory2)
		coins = get_coincidences_for_pair(u1, u2, dictios_of_users, user_inds, value_inds)
		print(coins)
		gen_post_coincidences(coins, u1, u2, directory2)
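A minimal usage sketch for gen_coincidences. The feature ids below are hypothetical; each one is expected to have a "<id>_files/" directory with the pickled clean_* structures already generated:

# Hypothetical feature ids; each "<id>_files/" directory must already contain
# clean_dictio_of_users.pkl, clean_value_ind.pkl and clean_user_ind.pkl.
gen_coincidences(["link", "email", "skype"])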
Example #2
    def fill_results_for_id(self, _id, join_dict):
        tic = time.time()
        # Paths to the pickled objects of this feature id
        user_ind_p = _id + "_files/clean_user_ind.pkl"
        sparse_matrix_dot_p = _id + "_files/sparse_matrix_dot_a.pkl"
        # Unpickling objects
        user_ind = unpickle_object(user_ind_p)
        sparse_matrix_dot = unpickle_object(sparse_matrix_dot_p)

        # Statistics retrieval
        total_pairs = len(join_dict)
        elements_written = 0
        for i, k in enumerate(join_dict):
            if i % 1000 == 0:
                self.pprint("[%s] Joining Results" % (_id),
                            "[%d Pairs Processed]" % (i),
                            "[%d Elements Written]" % (elements_written),
                            "[%0.3f Percentage]" % ((i / total_pairs) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            u1, u2 = k  # Extracting users
            # Only pairs where both users appear in this feature's index get a score.
            if u1 in user_ind and u2 in user_ind:
                elements_written += 1
                join_dict[k][_id] = sparse_matrix_dot[user_ind[u1],
                                                      user_ind[u2]]

        self.pprint("[END] [%s] Joining Results" % (_id),
                    "[%d Pairs Processed]" % (total_pairs),
                    "[%d Elements Written]" % (elements_written),
                    "[%0.3f Percentage]" % (100.0),
                    get_ram(), get_elapsed_time(tic))
        return join_dict
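The score read by fill_results_for_id is an entry of a precomputed user-by-user co-occurrence matrix. A self-contained sketch of that idea with toy data (plain scipy, not the pipeline's real matrices):

import numpy as np
from scipy.sparse import csr_matrix

# Toy user-by-value matrix: 3 users, 4 values (1 = the user used that value).
m = csr_matrix(np.array([[1, 0, 1, 0],
                         [1, 1, 0, 0],
                         [0, 1, 1, 1]]))
# m @ m.T is user-by-user: entry (i, j) counts values shared by users i and j.
dot = m @ m.T
print(dot[0, 1])  # users 0 and 1 share exactly one value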
Example #3
    def join_all_results(self):
        join_dict_p = "join_dict.pkl"

        # Collect every (u1, u2) pair found by any feature. This scan runs even
        # when a backup exists because it also populates self.total per feature.
        join_dict = dict()
        for _id in self.ids:
            filename = _id + "_files/results.pkl"
            results = unpickle_object(filename)
            self.total[_id] = len(results)
            self.pprint("[%s] Total results: %d" % (_id, self.total[_id]))
            for res in results:
                u1_u2_tuple = res[:2]
                if u1_u2_tuple not in join_dict:
                    join_dict[u1_u2_tuple] = dict()

        # Reuse a previously joined dictionary when backups are enabled.
        if self.backup and os.path.exists(join_dict_p):
            join_dict = unpickle_object(join_dict_p)
            return join_dict

        # Fill in, per feature, the score of every pair.
        for _id in self.ids:
            join_dict = self.fill_results_for_id(_id, join_dict)

        pickle_object(join_dict, join_dict_p)
        return join_dict
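For reference, the shape of the dictionary join_all_results produces; the ids and counts here are made up:

join_dict = {
    ("user_a", "user_b"): {"link": 3, "email": 1},  # pair found by two features
    ("user_a", "user_c"): {"link": 2},              # pair found by one feature
}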
Example #4
def user_removal_based_on_participation():

    keep_users_p = 'num_files/keep_users.pkl'
    # Reuse the cached result when it exists.
    if os.path.exists(keep_users_p):
        keep_users = unpickle_object(keep_users_p)
        return keep_users

    lst = read_csv_list('num_files/user_to_num.csv')[1:]
    users = [row[0] for row in lst]
    # Number of posts per user
    x = np.array([int(row[1]) for row in lst])
    # Characters of each post, per user
    y = [np.array([int(c) for c in row[2:]]) for row in lst]

    # Average characters per post of a user (0 for users without posts, so the
    # array stays aligned with `users` and `x`)
    z = np.array([i.mean() if len(i) > 0 else 0.0 for i in y])
    keep_users = set()
    # Keep users above the median in posts written or in average post length.
    limi = np.quantile(x, .50)
    limk = np.quantile(z, .50)
    for user, i, k in zip(users, x, z):
        if i > limi or k > limk:
            keep_users.add(user)

    pickle_object(keep_users, keep_users_p)
    return keep_users
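The rule above keeps a user whose post count or average post length exceeds the corresponding median. The same rule, vectorized on toy numbers:

import numpy as np

posts = np.array([2, 40, 7, 90, 5])               # posts per user
avg_chars = np.array([10., 300., 20., 5., 250.])  # average characters per post
keep = (posts > np.quantile(posts, .50)) | (avg_chars > np.quantile(avg_chars, .50))
print(keep)  # [False  True False  True  True]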
Example #5
    def gen_sparse_matrix(self, dictio_of_users, dictio_of_usage, num_values):
        tic = time.time()
        sparse_matrix_p = self.dir + 'sparse_matrix.pkl'

        # Adding file to the list for cleanup
        self.cleanup_list.append(sparse_matrix_p)

        if self.backup and os.path.exists(sparse_matrix_p):
            self.pprint("Sparse Matrix already exists, unpickling.", end='\r')
            sparse_matrix = unpickle_object(sparse_matrix_p)
            self.pprint("[END] Sparse Matrix already exists, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return sparse_matrix

        # Build COO-style triplets: one (user, value, usage) entry per use.
        num_users = len(dictio_of_users)
        rows = []
        cols = []
        data = []
        for ind, (uind, values) in enumerate(dictio_of_users.items()):
            if ind % 1000 == 0:
                self.pprint("Sparse Matrix Generation",
                            "[%d Users Processed]" % (ind),
                            "[%0.3f Percentage]" % ((ind / num_users) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            usages = dictio_of_usage[uind]
            for value, usage in zip(values, usages):
                rows.append(uind)
                cols.append(value)
                data.append(usage)

        sparse_matrix = csr_matrix((data, (rows, cols)),
                                   shape=(num_users, num_values),
                                   dtype=self.dtype)
        self.pprint("[END] Sparse Matrix Generation",
                    "[%d Users Processed]" % (ind),
                    "[%0.3f Percentage]" % ((ind / num_users) * 100),
                    get_ram(), get_elapsed_time(tic))
        pickle_object(sparse_matrix, sparse_matrix_p)
        return sparse_matrix
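gen_sparse_matrix relies on scipy's triplet constructor: parallel lists of row indices, column indices, and data are assembled into a users x values matrix. That step in isolation:

from scipy.sparse import csr_matrix

rows = [0, 0, 1, 2]   # user indices
cols = [0, 2, 1, 2]   # value indices
data = [5, 1, 2, 7]   # usage counts
m = csr_matrix((data, (rows, cols)), shape=(3, 4))
print(m.toarray())
# Duplicate (row, col) pairs, if any, would be summed by the constructor.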
Example #6
    def gen_matrix(self, join_dict):

        matrix_p = "matrix.pkl"
        self.pprint("Matrix generation", end='\r')
        if self.backup and os.path.exists(matrix_p):
            self.pprint("[END] Matrix generation, already existed")
            matrix = unpickle_object(matrix_p)
            return matrix

        # One row per user pair, one column per feature id; missing scores stay 0.
        num_pairs = len(join_dict)
        num_features = len(self.ids)
        matrix = np.zeros((num_pairs, num_features), dtype=np.uint32)
        for i, (pair, feature_dict) in enumerate(join_dict.items()):
            for j, feature in enumerate(self.ids):
                if feature in feature_dict:
                    matrix[i, j] = feature_dict[feature]

        self.pprint("[END] Matrix generation")
        pickle_object(matrix, matrix_p)
        return matrix
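gen_matrix densifies join_dict into one row per pair and one column per feature id, leaving missing features at zero. A toy run of the same loop, reusing the join_dict sketch from Example #3:

import numpy as np

ids = ["link", "email"]
join_dict = {("user_a", "user_b"): {"link": 3, "email": 1},
             ("user_a", "user_c"): {"link": 2}}
matrix = np.zeros((len(join_dict), len(ids)), dtype=np.uint32)
for i, (pair, feature_dict) in enumerate(join_dict.items()):
    for j, feature in enumerate(ids):
        matrix[i, j] = feature_dict.get(feature, 0)
print(matrix)  # [[3 1]
               #  [2 0]]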
Example #7
    def gen_data(self):
        tic = time.time()
        # Create the paths for storing the pickled dictionaries
        user_ind_p = self.dir + 'user_ind.pkl'
        value_ind_p = self.dir + 'value_ind.pkl'
        dictio_of_users_p = self.dir + 'dictio_of_users.pkl'
        dictio_of_values_p = self.dir + 'dictio_of_values.pkl'
        dictio_of_usage_p = self.dir + 'dictio_of_usage.pkl'

        # Adding files to the list for cleanup
        self.cleanup_list.extend([
            user_ind_p, value_ind_p, dictio_of_users_p, dictio_of_values_p,
            dictio_of_usage_p
        ])

        paths = (user_ind_p, value_ind_p, dictio_of_users_p,
                 dictio_of_values_p, dictio_of_usage_p)
        if self.backup and all(os.path.exists(p) for p in paths):
            self.pprint("Data Structures already exist, unpickling.", end='\r')
            user_ind = unpickle_object(user_ind_p)
            value_ind = unpickle_object(value_ind_p)
            dictio_of_users = unpickle_object(dictio_of_users_p)
            dictio_of_values = unpickle_object(dictio_of_values_p)
            # TODO: dictio_of_usage is temporarily not restored from backup.
            #dictio_of_usage = unpickle_object(dictio_of_usage_p)
            dictio_of_usage = None
            self.pprint("[END] Data Structures already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage

        lst = read_csv_list(self.data)[1:]

        tic = time.time()
        user_ind = {}
        value_ind = {}
        dictio_of_users = {}
        dictio_of_values = {}
        dictio_of_usage = {}
        total = len(lst)
        max_val = np.uint32(0)
        for uind, row in enumerate(lst):
            if uind % 1000 == 0:
                self.pprint("Data Structures Generation",
                            "[%d Users Processed]" % (uind),
                            "[%0.3f Percentage]" % ((uind / total) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            uind = np.uint32(uind)
            user = row[0]
            user_ind[user] = uind
            dictio_of_users[uind] = []
            dictio_of_usage[uind] = []
            # Every remaining cell holds one value together with its usage count.
            for t in row[1:]:
                value, usage = self.separate(t)
                usage = np.uint32(usage)
                if value not in value_ind:
                    # First time this value is seen: assign it a fresh index.
                    value_ind[value] = max_val
                    dictio_of_values[max_val] = []
                    max_val += 1
                vind = value_ind[value]
                dictio_of_values[vind].append(uind)
                dictio_of_users[uind].append(vind)
                dictio_of_usage[uind].append(usage)
        self.pprint("[END] Data Structures Generation",
                    "[%d Users Processed]" % (uind),
                    "[%0.3f Percentage]" % ((uind / total) * 100), get_ram(),
                    get_elapsed_time(tic))

        lst = None  # Freeing space from list, no longer needed

        #self.pprint("[0/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(user_ind, user_ind_p)
        #self.pprint("[1/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(value_ind, value_ind_p)
        #self.pprint("[2/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(dictio_of_users, dictio_of_users_p)
        #self.pprint("[3/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(dictio_of_values, dictio_of_values_p)
        #self.pprint("[4/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        #pickle_object(dictio_of_usage, dictio_of_usage_p)
        #self.pprint("[END] [5/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
        return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage
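gen_data assumes each CSV row looks like [user, cell, cell, ...] where self.separate() splits a cell into a value and its usage count; the real cell format is defined elsewhere. A reduced, standalone version of the same index-building pattern, with a hypothetical separate():

def separate(cell):
    # Hypothetical "value-usage" format; the real separator lives in self.separate().
    value, usage = cell.rsplit("-", 1)
    return value, int(usage)

value_ind, dictio_of_users = {}, {}
rows = [["alice", "foo-3", "bar-1"], ["bob", "foo-2"]]
for uind, row in enumerate(rows):
    dictio_of_users[uind] = []
    for cell in row[1:]:
        value, usage = separate(cell)
        vind = value_ind.setdefault(value, len(value_ind))  # fresh index on first sight
        dictio_of_users[uind].append(vind)
print(value_ind)        # {'foo': 0, 'bar': 1}
print(dictio_of_users)  # {0: [0, 1], 1: [0]}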
Example #8
    def clean_matrix(self, sparse_matrix, user_ind, value_ind, dictio_of_users,
                     dictio_of_values):
        tic = time.time()


        sparse_matrix_p = self.dir + 'clean_sparse_matrix.pkl'
        user_ind_p = self.dir + 'clean_user_ind.pkl'
        value_ind_p = self.dir + 'clean_value_ind.pkl'
        dictio_of_users_p = self.dir + 'clean_dictio_of_users.pkl'
        dictio_of_values_p = self.dir + 'clean_dictio_of_values.pkl'

        # Adding files to the list for cleanup
        self.cleanup_list.extend([
            sparse_matrix_p, user_ind_p, value_ind_p, dictio_of_users_p,
            dictio_of_values_p
        ])

        paths = (sparse_matrix_p, user_ind_p, value_ind_p, dictio_of_users_p,
                 dictio_of_values_p)
        if self.backup and all(os.path.exists(p) for p in paths):
            self.pprint("Clean data already exist, unpickling.", end='\r')
            user_ind = unpickle_object(user_ind_p)
            value_ind = unpickle_object(value_ind_p)
            dictio_of_users = unpickle_object(dictio_of_users_p)
            dictio_of_values = unpickle_object(dictio_of_values_p)
            sparse_matrix = unpickle_object(sparse_matrix_p)
            self.pprint("[END] Clean data already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values

        user_set = set(dictio_of_users.keys())
        self.pprint("Keeping values that appear more than once.",
                    get_ram(),
                    get_elapsed_time(tic),
                    end='\r')
        # A value used by a single user can never link two accounts.
        value_set = {k for k, v in dictio_of_values.items() if len(v) > 1}
        self.pprint(
            "[END] Keeping values that appear more than once",
            "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
            (len(value_set), len(user_set), len(value_set)), get_ram(),
            get_elapsed_time(tic))

        # We execute all user removal procedures specified
        if self.user_removal is not None:
            for ind, procedure in enumerate(self.user_removal):
                self.pprint("Executing user removal procedure [%d]" % (ind),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
                user_list = procedure(user_ind, value_ind, dictio_of_users,
                                      dictio_of_values)
                # Keep only users that survive every procedure so far.
                user_set = user_set.intersection(set(user_list))
                self.pprint(
                    "[END] Executing user removal procedure [%d]" % (ind),
                    "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                    (len(user_list), len(user_set), len(value_set)), get_ram(),
                    get_elapsed_time(tic))

        # We execute all value removal procedures specified by the user
        if self.value_removal is not None:
            for ind, procedure in enumerate(self.value_removal):
                self.pprint("Executing value removal procedure [%d]" % (ind),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
                value_list = procedure(user_ind, value_ind, dictio_of_users,
                                       dictio_of_values)
                # Keep only values that survive every procedure so far.
                value_set = value_set.intersection(set(value_list))
                self.pprint(
                    "[END] Executing value removal procedure [%d]" % (ind),
                    "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                    (len(value_list), len(user_set), len(value_set)),
                    get_ram(), get_elapsed_time(tic))

        sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
            sparse_matrix, user_ind, value_ind, dictio_of_users,
            dictio_of_values, user_set, value_set)

        self.pprint("Obtaining empty data", end='\r')
        user_set = set(dictio_of_users.keys())
        value_set = set(dictio_of_values.keys())
        user_set_rem = set([
            uind for uind, vinds in dictio_of_users.items() if len(vinds) == 0
        ])
        value_set_rem = set([
            vind for vind, uinds in dictio_of_values.items() if len(uinds) == 0
        ])
        user_set = user_set.difference(user_set_rem)
        value_set = value_set.difference(value_set_rem)
        self.pprint("[END] Obtaining empty data")
        sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
            sparse_matrix, user_ind, value_ind, dictio_of_users,
            dictio_of_values, user_set, value_set)

        pickle_object(sparse_matrix, sparse_matrix_p)
        pickle_object(user_ind, user_ind_p)
        pickle_object(value_ind, value_ind_p)
        pickle_object(dictio_of_users, dictio_of_users_p)
        pickle_object(dictio_of_values, dictio_of_values_p)
        return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values
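The first filter in clean_matrix keeps only values used by more than one user, since a value seen once can never link two accounts. The core of that step in isolation (toy indices):

dictio_of_values = {0: [10, 11], 1: [10], 2: [11, 12, 13]}  # value -> users using it
value_set = {vind for vind, uinds in dictio_of_values.items() if len(uinds) > 1}
print(value_set)  # {0, 2}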
Example #9
def gen_latex_coincidences(do, specific_users=None):
	directory = "multfs_users/"
	lst_users = read_csv_list("multfs.csv")[1:]
	create_dir(directory)
	dictios_of_users = []
	value_inds = []
	user_inds = []
	file = open('analysis.tex', 'w+')
	header = """\\documentclass[12pt]{article}

\\usepackage[utf8]{inputenc}
\\usepackage[T1]{fontenc}
\\usepackage[USenglish]{babel}

\\usepackage{xcolor} % Colors
\\usepackage{tabularx} % Other type of columns
\\usepackage{caption} 
\\usepackage{hyperref}
\\renewcommand{\\baselinestretch}{1.3}


\\usepackage{minted}

\\title{Found pairs of users}
\\author{-}
\\date{}
\\begin{document}

\\maketitle\n"""

	footer = """
\\end{document}"""

	file.write(header)
	# If not specific users are given, take the first 5. Otherwise, take those from the list
	if specific_users is None:
		pairs = lst_users[:5]
	else:
		pairs=[]
		for tuple_list in lst_users:
			(u1, u2, _, _, _, _)=tuple_list
			for specific_user in specific_users:
				if (u1==specific_user and u2 in specific_users) or (u2==specific_user and u1 in specific_users):
					pairs.append(tuple_list)

	# Load the cleaned per-feature structures for every requested feature id.
	for _id in do:
		dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
		dictios_of_users.append(dictio_of_users)

		value_ind = unpickle_object(_id + "_files/clean_value_ind.pkl")
		value_inds.append(value_ind)
		user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
		user_inds.append(user_ind)
		print("ID: %s" % (_id), len(dictio_of_users), len(user_ind))

	for index, (u1, u2, _, _, _, _) in enumerate(pairs):
		print("Going for %d" % (index), u1, u2)
		uname1, _, _ = get_username(u1)
		uname2, _, _ = get_username(u2)
		file.write("\\section{%s(%s)-%s(%s)} \n" % (u1, uname1, u2, uname2))
		coins = get_coincidences_for_pair(u1, u2, dictios_of_users, user_inds, value_inds)
		gen_latex_post_coincidences(coins, u1, u2, file)

	file.write(footer)
	file.close()
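A usage sketch for gen_latex_coincidences with hypothetical feature and user ids. Note the generated analysis.tex loads minted, so it must be compiled with shell escape enabled (e.g. pdflatex -shell-escape analysis.tex):

# Hypothetical ids: restrict the report to pairs formed by these two users.
gen_latex_coincidences(["link", "email"], specific_users=["12345", "67890"])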