def gen_coincidences(do, num_pairs=5):
    """Generate per-pair coincidence reports for the leading user pairs.

    Parameters
    ----------
    do : iterable of str
        Feature ids; for each one the cleaned pickled structures are loaded
        from "<id>_files/".
    num_pairs : int, optional
        How many leading rows of multfs.csv to process. Defaults to 5,
        which was the hard-coded value in the original implementation.

    Side effects: creates one "multfs_users/<u1>(<name1>)-<u2>(<name2>)/"
    directory per pair and writes the coincidences there via
    gen_post_coincidences().
    """
    directory = "multfs_users/"
    lst_users = read_csv_list("multfs.csv")[1:]  # drop the CSV header row
    create_dir(directory)

    dictios_of_users = []
    value_inds = []
    user_inds = []
    pairs = lst_users[:num_pairs]

    # Load the cleaned per-feature data structures for every feature id.
    for _id in do:
        dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
        dictios_of_users.append(dictio_of_users)
        print(len(dictio_of_users))
        value_ind = unpickle_object(_id + "_files/clean_value_ind.pkl")
        value_inds.append(value_ind)
        user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
        user_inds.append(user_ind)

    # Each CSV row is (u1, u2, <4 ignored score columns>).
    for index, (u1, u2, _, _, _, _) in enumerate(pairs):
        print("Going for %d" % (index), u1, u2)
        uname1, rg1, lp1 = get_username(u1)
        uname2, rg2, lp2 = get_username(u2)
        directory2 = directory + "%s(%s)-%s(%s)/" % (u1, uname1, u2, uname2)
        create_dir(directory2)
        coins = get_coincidences_for_pair(u1, u2, dictios_of_users,
                                          user_inds, value_inds)
        print(coins)
        gen_post_coincidences(coins, u1, u2, directory2)
def fill_results_for_id(self, _id, join_dict):
    """Fill join_dict with the dot-product score of each user pair for one feature.

    For every (u1, u2) key in *join_dict* whose users both exist in this
    feature's cleaned user index, stores
    ``join_dict[(u1, u2)][_id] = sparse_matrix_dot[u1_row, u2_row]``.

    Parameters
    ----------
    _id : str
        Feature id; data is read from "<id>_files/".
    join_dict : dict
        Maps (u1, u2) tuples to per-feature score dicts (mutated in place).

    Returns
    -------
    dict
        The same *join_dict*, updated.
    """
    tic = time.time()
    # Paths to directories
    user_ind_p = _id + "_files/clean_user_ind.pkl"
    sparse_matrix_dot_p = _id + "_files/sparse_matrix_dot_a.pkl"
    # Unpickling objects
    user_ind = unpickle_object(user_ind_p)
    sparse_matrix_dot = unpickle_object(sparse_matrix_dot_p)

    # Statistics retrieval
    total_pairs = len(join_dict)
    # Fix: an empty join_dict used to raise NameError (loop variable `i`
    # never bound) and ZeroDivisionError in the final progress message.
    if total_pairs == 0:
        self.pprint("[END] [%s] Joining Results" % (_id),
                    "[0 Pairs Processed]", "[0 Elements Written]",
                    get_ram(), get_elapsed_time(tic))
        return join_dict

    elements_written = 0
    for i, k in enumerate(join_dict.keys()):
        if i % 1000 == 0:
            self.pprint("[%s] Joining Results" % (_id),
                        "[%d Pairs Processed]" % (i),
                        "[%d Elements Written]" % (elements_written),
                        "[%0.3f Percentage]" % ((i / total_pairs) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        u1, u2 = k  # Extracting users
        # Only pairs where both users survived this feature's cleaning.
        if u1 in user_ind and u2 in user_ind:
            elements_written += 1
            join_dict[k][_id] = sparse_matrix_dot[user_ind[u1], user_ind[u2]]
    self.pprint("[END] [%s] Joining Results" % (_id),
                "[%d Pairs Processed]" % (i),
                "[%d Elements Written]" % (elements_written),
                "[%0.3f Percentage]" % ((i / total_pairs) * 100),
                get_ram(), get_elapsed_time(tic))
    return join_dict
def join_all_results(self):
    """Collect every user pair seen in any feature's results file.

    Builds a dict mapping (u1, u2) tuples to per-feature score dicts,
    records per-feature result counts in ``self.total``, then either
    returns a pickled backup (when ``self.backup`` is set and the pickle
    exists) or fills the scores feature by feature and pickles the result.
    """
    join_dict_p = "join_dict.pkl"
    join_dict = {}
    for _id in self.ids:
        results = unpickle_object(_id + "_files/results.pkl")
        self.total[_id] = len(results)
        self.pprint("[%s] Total results: %d" % (_id, self.total[_id]))
        for res in results:
            # Register the pair with an (initially empty) feature dict.
            join_dict.setdefault(res[:2], {})
    # A previously pickled join_dict, when backups are enabled, wins over
    # the freshly rebuilt one.
    if self.backup and os.path.exists(join_dict_p):
        return unpickle_object(join_dict_p)
    for _id in self.ids:
        join_dict = self.fill_results_for_id(_id, join_dict)
    pickle_object(join_dict, join_dict_p)
    return join_dict
def user_removal_based_on_participation():
    """Select users to keep based on their posting participation.

    A user is kept when either their post count or their average post
    length exceeds the respective median over all users. The result is
    cached in 'num_files/keep_users.pkl' and reused on later calls.

    Returns
    -------
    set of str
        User identifiers to keep.
    """
    keep_users_p = 'num_files/keep_users.pkl'
    if os.path.exists(keep_users_p):
        return unpickle_object(keep_users_p)

    lst = read_csv_list('num_files/user_to_num.csv')[1:]  # skip header
    users = [row[0] for row in lst]
    # Number of posts per user
    x = np.array([int(row[1]) for row in lst])
    # Characters per user post
    y = [np.array([int(c) for c in row[2:]]) for row in lst]
    # Average characters per post of a user.
    # BUG FIX: the original filtered out users with zero posts
    # (`if len(i) > 0`), which shrank z and silently misaligned it with
    # `users` and `x` in the zip below. Use 0.0 for post-less users so all
    # three sequences stay index-aligned.
    z = np.array([posts.mean() if len(posts) > 0 else 0.0 for posts in y])

    keep_users = set()
    limi = np.quantile(x, .50)  # median post count
    limk = np.quantile(z, .50)  # median average post length
    for user, i, k in zip(users, x, z):
        if i > limi or k > limk:
            keep_users.add(user)
    pickle_object(keep_users, keep_users_p)
    return keep_users
def gen_sparse_matrix(self, dictio_of_users, dictio_of_usage, num_values):
    """Build (or unpickle a cached) users-by-values CSR usage matrix.

    Parameters
    ----------
    dictio_of_users : dict
        Maps user index -> list of value indices used by that user.
    dictio_of_usage : dict
        Maps user index -> usage counts, parallel to dictio_of_users.
    num_values : int
        Number of columns (distinct values) in the matrix.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(dictio_of_users), num_values).
    """
    tic = time.time()
    sparse_matrix_p = self.dir + 'sparse_matrix.pkl'
    # Adding files to list for cleanup
    self.cleanup_list.append(sparse_matrix_p)
    if self.backup and os.path.exists(sparse_matrix_p):
        self.pprint("Sparse Matrix already exist, unpickling.", end='\r')
        sparse_matrix = unpickle_object(sparse_matrix_p)
        self.pprint("[END] Sparse Matrix already exist, unpickling.",
                    get_ram(), get_elapsed_time(tic))
        return sparse_matrix

    num_users = len(dictio_of_users)
    rows = []
    cols = []
    data = []
    # Fix: with an empty dictio_of_users the final progress message used
    # to raise NameError (`ind` unbound) and divide by zero.
    ind = 0
    for ind, (uind, values) in enumerate(dictio_of_users.items()):
        if ind % 1000 == 0:
            self.pprint("Sparse Matrix Generation",
                        "[%d Users Processed]" % (ind),
                        "[%0.3f Percentage]" % ((ind / num_users) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        usages = dictio_of_usage[uind]
        for value, usage in zip(values, usages):
            rows.append(uind)
            cols.append(value)
            data.append(usage)
    sparse_matrix = csr_matrix((data, (rows, cols)),
                               shape=(num_users, num_values),
                               dtype=self.dtype)
    self.pprint("[END] Sparse Matrix Generation",
                "[%d Users Processed]" % (ind),
                "[%0.3f Percentage]" % ((ind / max(num_users, 1)) * 100),
                get_ram(), get_elapsed_time(tic))
    pickle_object(sparse_matrix, sparse_matrix_p)
    return sparse_matrix
def gen_matrix(self, join_dict):
    """Build (or unpickle a cached) pairs-by-features score matrix.

    Row i corresponds to the i-th (u1, u2) pair in *join_dict*; column j
    corresponds to ``self.ids[j]``. Missing features stay at 0. The matrix
    is pickled to "matrix.pkl" before being returned.
    """
    matrix_p = "matrix.pkl"
    self.pprint("Matrix generation", end='\r')
    if self.backup and os.path.exists(matrix_p):
        self.pprint("[END] Matrix generation, already existed")
        return unpickle_object(matrix_p)
    matrix = np.zeros((len(join_dict), len(self.ids)), dtype=np.uint32)
    for row_idx, feature_dict in enumerate(join_dict.values()):
        for col_idx, feature in enumerate(self.ids):
            if feature in feature_dict:
                matrix[row_idx, col_idx] = feature_dict[feature]
    self.pprint("[END] Matrix generation")
    pickle_object(matrix, matrix_p)
    return matrix
def gen_data(self):
    """Parse the raw CSV (self.data) into the core index/lookup structures.

    Returns a 5-tuple:
        user_ind         : dict mapping user name -> np.uint32 user index
        value_ind        : dict mapping value string -> np.uint32 value index
        dictio_of_users  : dict user index -> list of value indices used
        dictio_of_values : dict value index -> list of user indices using it
        dictio_of_usage  : dict user index -> list of usage counts
                           (parallel to dictio_of_users); None when loaded
                           from backup (see TODO below)

    When ``self.backup`` is set and all five pickles already exist, the
    structures are unpickled instead of rebuilt.
    """
    tic = time.time()
    # Create the path for storing the dictionaries
    user_ind_p = self.dir + 'user_ind.pkl'
    value_ind_p = self.dir + 'value_ind.pkl'
    dictio_of_users_p = self.dir + 'dictio_of_users.pkl'
    dictio_of_values_p = self.dir + 'dictio_of_values.pkl'
    dictio_of_usage_p = self.dir + 'dictio_of_usage.pkl'
    # Adding files to list for cleanup
    self.cleanup_list.append(user_ind_p), self.cleanup_list.append(
        value_ind_p), self.cleanup_list.append(
            dictio_of_users_p), self.cleanup_list.append(
                dictio_of_values_p), self.cleanup_list.append(
                    dictio_of_usage_p)
    # Backup path: only taken when every single pickle is present.
    if self.backup and os.path.exists(user_ind_p) and os.path.exists(
            value_ind_p
    ) and os.path.exists(dictio_of_users_p) and os.path.exists(
            dictio_of_values_p) and os.path.exists(dictio_of_usage_p):
        self.pprint("Data Structures already exist, unpickling.", end='\r')
        user_ind = unpickle_object(user_ind_p)
        value_ind = unpickle_object(value_ind_p)
        dictio_of_users = unpickle_object(dictio_of_users_p)
        dictio_of_values = unpickle_object(dictio_of_values_p)
        # TODO Remove comment
        # NOTE(review): usage dict is deliberately NOT unpickled here and
        # None is returned in its place — presumably to save RAM; confirm
        # callers of the backup path tolerate None.
        #dictio_of_usage = unpickle_object(dictio_of_usage_p)
        dictio_of_usage = None
        self.pprint("[END] Data Structures already exist, unpickling.",
                    get_ram(), get_elapsed_time(tic))
        return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage
    # Fresh build: one CSV row per user; row[0] is the user name, the
    # remaining cells are value/usage entries split by self.separate().
    lst = read_csv_list(self.data)[1:]
    tic = time.time()
    user_ind = {}
    value_ind = {}
    dictio_of_users = {}
    dictio_of_values = {}
    dictio_of_usage = {}
    total = len(lst)
    max_val = np.uint32(0)  # next free value index
    for uind, i in enumerate(lst):
        if uind % 1000 == 0:
            self.pprint("Data Structures Generation",
                        "[%d Users Processed]" % (uind),
                        "[%0.3f Percentage]" % ((uind / total) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        # Re-bind the loop counter as np.uint32 so every stored index
        # shares the compact dtype.
        uind = np.uint32(uind)
        user_ind[i[0]] = uind
        user = i[0]
        dictio_of_users[uind] = []
        dictio_of_usage[uind] = []
        for t in i[1:]:
            value, usage = self.separate(t)
            usage = np.uint32(usage)
            # First sighting of this value: assign it the next index.
            if value not in value_ind:
                value_ind[value] = max_val
                dictio_of_values[max_val] = []
                max_val += 1
            vind = value_ind[value]
            dictio_of_values[vind].append(uind)
            dictio_of_users[uind].append(vind)
            dictio_of_usage[uind].append(usage)
    self.pprint("[END] Data Structures Generation",
                "[%d Users Processed]" % (uind),
                "[%0.3f Percentage]" % ((uind / total) * 100),
                get_ram(), get_elapsed_time(tic))
    lst = None  # Freeing space from list, no longer needed
    # NOTE(review): persisting the structures is currently disabled, which
    # means the backup branch above can never trigger on a fresh run —
    # confirm whether this is intentional before re-enabling.
    #self.pprint("[0/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
    #pickle_object(user_ind, user_ind_p)
    #self.pprint("[1/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
    #pickle_object(value_ind, value_ind_p)
    #self.pprint("[2/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
    #pickle_object(dictio_of_users, dictio_of_users_p)
    #self.pprint("[3/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
    #pickle_object(dictio_of_values, dictio_of_values_p)
    #self.pprint("[4/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
    #pickle_object(dictio_of_usage, dictio_of_usage_p)
    #self.pprint("[END] [5/5] Storing data structures to disk", get_ram(), get_elapsed_time(tic))
    return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage
def clean_matrix(self, sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values):
    """Prune users and values from the sparse matrix and its index structures.

    Pipeline: (1) drop values used by only one user, (2) apply every
    configured user-removal and value-removal procedure (set intersection
    of survivors), (3) remove the pruned rows/columns via
    self.remove_user_value_set, (4) drop users/values that became empty,
    and (5) pickle the five cleaned structures.

    Returns the cleaned 5-tuple:
        (sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values)

    When ``self.backup`` is set and all five clean pickles exist, the
    cleaned structures are unpickled and returned directly.
    """
    tic = time.time()
    #inv_user_ind = {v: k for k, v in user_ind.items()}
    #inv_value_ind = {v: k for k, v in value_ind.items()}
    sparse_matrix_p = self.dir + 'clean_sparse_matrix.pkl'
    user_ind_p = self.dir + 'clean_user_ind.pkl'
    value_ind_p = self.dir + 'clean_value_ind.pkl'
    dictio_of_users_p = self.dir + 'clean_dictio_of_users.pkl'
    dictio_of_values_p = self.dir + 'clean_dictio_of_values.pkl'
    # Adding files to list for cleanup
    self.cleanup_list.append(sparse_matrix_p), self.cleanup_list.append(
        user_ind_p
    ), self.cleanup_list.append(value_ind_p), self.cleanup_list.append(
        dictio_of_users_p), self.cleanup_list.append(dictio_of_values_p)
    # Backup path: only taken when every clean pickle is present.
    if self.backup and os.path.exists(sparse_matrix_p) and os.path.exists(
            user_ind_p) and os.path.exists(value_ind_p) and os.path.exists(
                dictio_of_users_p) and os.path.exists(dictio_of_values_p):
        self.pprint("Clean data already exist, unpickling.", end='\r')
        user_ind = unpickle_object(user_ind_p)
        value_ind = unpickle_object(value_ind_p)
        dictio_of_users = unpickle_object(dictio_of_users_p)
        dictio_of_values = unpickle_object(dictio_of_values_p)
        sparse_matrix = unpickle_object(sparse_matrix_p)
        self.pprint("[END] Clean data already exist, unpickling.",
                    get_ram(), get_elapsed_time(tic))
        return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values
    # Start from the full user set; keep only values shared by >1 user
    # (despite the message wording, single-user values are DROPPED here).
    user_set = set(dictio_of_users.keys())
    self.pprint("Taking values that appear once.", get_ram(),
                get_elapsed_time(tic), end='\r')
    value_set = set([k for k, v in dictio_of_values.items() if len(v) > 1])
    self.pprint(
        "[END] Taking values that appear once",
        "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
        (len(value_set), len(user_set), len(value_set)), get_ram(),
        get_elapsed_time(tic))
    # We execute all user removal procedures specified
    # Each procedure returns the users it wants to KEEP; survivors are the
    # intersection across all procedures.
    if not self.user_removal is None:
        for ind, procedure in enumerate(self.user_removal):
            self.pprint("Executing user removal procedure [%d] " % (ind),
                        get_ram(), get_elapsed_time(tic), end='\r')
            user_list = procedure(user_ind, value_ind, dictio_of_users,
                                  dictio_of_values)
            user_set = user_set.intersection(set(user_list))
            self.pprint(
                "[END] Executing user removal procedure [%d]" % (ind + 1),
                "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                (len(user_list), len(user_set), len(value_set)), get_ram(),
                get_elapsed_time(tic))
    # We execute all value removal procedures specified by the user
    if not self.value_removal is None:
        for ind, procedure in enumerate(self.value_removal):
            self.pprint("Executing value removal procedure [%d]" % (ind),
                        get_ram(), get_elapsed_time(tic), end='\r')
            value_list = procedure(user_ind, value_ind, dictio_of_users,
                                   dictio_of_values)
            value_set = value_set.intersection(set(value_list))
            self.pprint(
                "[END] Executing value removal procedure [%d]" % (ind + 1),
                "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                (len(value_list), len(user_set), len(value_set)), get_ram(),
                get_elapsed_time(tic))
    # First pruning pass: keep only the surviving user/value sets.
    sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
        sparse_matrix, user_ind, value_ind, dictio_of_users,
        dictio_of_values, user_set, value_set)
    # Second pass: drop users/values whose adjacency lists became empty
    # after the first pruning.
    self.pprint("Obtaining empty data", end='\r')
    user_set = set(dictio_of_users.keys())
    value_set = set(dictio_of_values.keys())
    user_set_rem = set([
        uind for uind, vinds in dictio_of_users.items() if len(vinds) == 0
    ])
    value_set_rem = set([
        vind for vind, uinds in dictio_of_values.items() if len(uinds) == 0
    ])
    user_set = user_set.difference(user_set_rem)
    value_set = value_set.difference(value_set_rem)
    self.pprint("[END] Obtaining empty data")
    sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
        sparse_matrix, user_ind, value_ind, dictio_of_users,
        dictio_of_values, user_set, value_set)
    # Persist the cleaned structures so the backup path can reuse them.
    pickle_object(sparse_matrix, sparse_matrix_p)
    pickle_object(user_ind, user_ind_p)
    pickle_object(value_ind, value_ind_p)
    pickle_object(dictio_of_users, dictio_of_users_p)
    pickle_object(dictio_of_values, dictio_of_values_p)
    return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values
def gen_latex_coincidences(do, specific_users=None):
    """Write a LaTeX report ('analysis.tex') of coincidences for user pairs.

    Parameters
    ----------
    do : iterable of str
        Feature ids; cleaned pickled structures are loaded from "<id>_files/".
    specific_users : list of str, optional
        When None or empty, the first 5 pairs of multfs.csv are used.
        Otherwise, only pairs whose BOTH users appear in this list.

    Bug fixes vs the original:
    - The default was the mutable ``[]`` while the code checked
      ``is None`` — so the documented "take first 5" branch was
      unreachable and a default call produced zero pairs.
    - A pair with both users in specific_users was appended once per
      matching user (duplicates); membership is now tested once per pair.
    """
    directory = "multfs_users/"
    lst_users = read_csv_list("multfs.csv")[1:]  # drop the CSV header row
    create_dir(directory)
    dictios_of_users = []
    value_inds = []
    user_inds = []
    header = """\\documentclass[12pt]{article}
\\usepackage[utf8]{inputenc}
\\usepackage[T1]{fontenc}
\\usepackage[USenglish]{babel}
\\usepackage{xcolor} % Colors
\\usepackage{tabularx} % Other type of columns
\\usepackage{caption}
\\usepackage{hyperref}
\\renewcommand{\\baselinestretch}{1.3}
\\usepackage{minted}
\\title{Found pairs of users}
\\author{-}
\\date{}
\\begin{document}
\\maketitle\n"""
    footer = """ \\end{document}"""

    # If no specific users are given, take the first 5 pairs. Otherwise,
    # keep only pairs where both users belong to the given list.
    if not specific_users:
        pairs = lst_users[:5]
    else:
        wanted = set(specific_users)
        pairs = [row for row in lst_users
                 if row[0] in wanted and row[1] in wanted]

    # Load the cleaned per-feature data structures for every feature id.
    for _id in do:
        dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
        dictios_of_users.append(dictio_of_users)
        value_ind = unpickle_object(_id + "_files/clean_value_ind.pkl")
        value_inds.append(value_ind)
        user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
        user_inds.append(user_ind)
        print("ID: %s" % (_id), len(dictio_of_users), len(user_ind))

    # `with` guarantees the report is closed even if a pair fails.
    with open('analysis.tex', 'w+') as report:
        report.write(header)
        for index, (u1, u2, _, _, _, _) in enumerate(pairs):
            print("Going for %d" % (index), u1, u2)
            uname1, rg1, lp1 = get_username(u1)
            uname2, rg2, lp2 = get_username(u2)
            report.write("\\section{%s(%s)-%s(%s)} \n" % (u1, uname1, u2, uname2))
            coins = get_coincidences_for_pair(u1, u2, dictios_of_users,
                                              user_inds, value_inds)
            gen_latex_post_coincidences(coins, u1, u2, report)
        report.write(footer)