def plot_network_chartz():
    lst = read_csv_list("results_email_comp.csv")[1:1000]
    lst = [x for x in lst if float(x[-1]) < 10000.0]
    print(len(lst))
    df = pd.DataFrame({'from': [x[0] for x in lst], 'to': [x[1] for x in lst]})
    # Build the graph (from_pandas_dataframe was removed in networkx 2.x)
    G = nx.from_pandas_edgelist(df, 'from', 'to')
    plt.figure(figsize=(50, 50))
    node_color = [100 * G.degree(node) for node in G]
    node_size = [1000 * G.degree(node) for node in G]
    # draw_spring() does not forward `k` to the layout, so compute the
    # layout explicitly and pass it to draw()
    pos = nx.spring_layout(G, k=0.14)
    nx.draw(G, pos, with_labels=True, node_size=node_size,
            node_color=node_color, node_shape="o", alpha=0.5, linewidths=4,
            font_size=25, font_color="black", font_weight="bold", width=2,
            edge_color="grey")
    plt.savefig("graphs/Graph_Email.png", format="PNG")
def gen_coincidences(do):
    directory = "multfs_users/"
    lst_users = read_csv_list("multfs.csv")[1:]
    create_dir(directory)
    dictios_of_users = []
    value_inds = []
    user_inds = []
    pairs = lst_users[:5]
    for _id in do:
        dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
        dictios_of_users.append(dictio_of_users)
        print(len(dictio_of_users))
        value_inds.append(unpickle_object(_id + "_files/clean_value_ind.pkl"))
        user_inds.append(unpickle_object(_id + "_files/clean_user_ind.pkl"))
    for index, (u1, u2, _, _, _, _) in enumerate(pairs):
        print("Going for %d" % index, u1, u2)
        uname1, rg1, lp1 = get_username(u1)
        uname2, rg2, lp2 = get_username(u2)
        directory2 = directory + "%s(%s)-%s(%s)/" % (u1, uname1, u2, uname2)
        create_dir(directory2)
        coins = get_coincidences_for_pair(u1, u2, dictios_of_users,
                                          user_inds, value_inds)
        print(coins)
        gen_post_coincidences(coins, u1, u2, directory2)
def join_all_results(self):
    dictio_of_results = dict()
    for indi, tup in enumerate(self.list_files):
        tic = time.time()  # reset per file so the timing message is accurate
        filename, prefix = tup[0], tup[1]
        print("[-] Going for file: %d - %s" % (indi, filename))
        lst_results = read_csv_list(filename)[1:]
        filelen = len(lst_results)
        print("[+] Sorting list")
        lst_results = sorted(lst_results, key=lambda x: x[0] + x[1])
        status.create_numbar(100, filelen)
        for indj, entry in enumerate(lst_results):
            status.update_numbar(indj, filelen)
            if entry[0] not in dictio_of_results:
                dictio_of_results[entry[0]] = dict()
            if entry[1] not in dictio_of_results[entry[0]]:
                dictio_of_results[entry[0]][entry[1]] = dict()
            dictio_of_results[entry[0]][entry[1]][prefix] = float(entry[2])
        status.end_numbar()
        print("[+] Ended with file: %d - %s in %d seconds" %
              (indi, filename, time.time() - tic))
    return dictio_of_results
def simplify_list():
    dirname = 'trigram_files/'
    lst = read_csv_list(dirname + "user_to_trigrams_complex.csv")[1:]
    # Keep only the trigram id from each "id:count" field
    lst = [(x[0],) + tuple(int(y.split(':')[0]) for y in x[1:]) for x in lst]
    gen_csv_from_tuples(dirname + 'user_to_trigrams.csv', ['user', 'trigrams_#'], lst)
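# The CSV helpers used throughout these functions come from common_utils (see
# the import block further down). Their implementations are not shown in this
# section; a minimal sketch, assuming plain comma-separated files with one
# tuple per row, could look like the following. Treat these as stand-ins for
# reference, not the project's actual code:
#
#     import csv
#
#     def read_csv_list(filename):
#         # Returns the file as a list of rows (lists of strings), header included.
#         with open(filename, newline='') as f:
#             return [row for row in csv.reader(f)]
#
#     def gen_csv_from_tuples(filename, header, rows):
#         # Writes a header row followed by one row per tuple.
#         with open(filename, 'w', newline='') as f:
#             writer = csv.writer(f)
#             writer.writerow(header)
#             writer.writerows(rows)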
def user_removal_based_on_participation():
    keep_users_p = 'num_files/keep_users.pkl'
    if os.path.exists(keep_users_p):
        # Cached result from a previous run
        return unpickle_object(keep_users_p)
    lst = read_csv_list('num_files/user_to_num.csv')[1:]
    users = [i[0] for i in lst]
    # Number of posts per user
    x = np.array([int(x[1]) for x in lst])
    # Characters per post, per user
    y = [np.array([int(y) for y in x[2:]]) for x in lst]
    # Average characters per post; keep one entry per user (None for users
    # with no posts) so the zip below stays aligned. The original dropped
    # empty arrays here, which shifted every user after the first one with
    # no posts.
    z = [i.mean() if len(i) > 0 else None for i in y]
    keep_users = set()
    limi = np.quantile(x, .50)
    limk = np.quantile([v for v in z if v is not None], .50)
    for user, i, k in zip(users, x, z):
        if i > limi or (k is not None and k > limk):
            keep_users.add(user)
    pickle_object(keep_users, keep_users_p)
    return keep_users
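# pickle_object / unpickle_object are assumed to be thin wrappers around the
# standard pickle module from the project's utility code; a plausible sketch
# (an assumption, not the project's actual helpers):
#
#     import pickle
#
#     def pickle_object(obj, path):
#         with open(path, 'wb') as f:
#             pickle.dump(obj, f)
#
#     def unpickle_object(path):
#         with open(path, 'rb') as f:
#             return pickle.load(f)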
def read_list_with_format(self, filename):
    lst_users = read_csv_list(filename)
    for i in range(len(lst_users)):
        entry = list(lst_users[i])
        # Columns 2 onwards are numeric scores
        for j in range(2, len(entry)):
            entry[j] = float(entry[j])
        lst_users[i] = entry
    return lst_users
def plot_scoring_data2():
    lst = read_csv_list("results_ip_comp.csv")[1:]
    data = [float(x[-1]) for x in lst]
    print(data[:10])
    sns_plot = sns.distplot(data, kde=False, rug=True)
    fig = sns_plot.get_figure()
    fig.savefig("output.png")
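# Note: sns.distplot() has been deprecated since seaborn 0.11. On newer
# seaborn the equivalent would be something like the sketch below (histplot
# exists from 0.11 onwards; the rug is drawn separately):
#
#     ax = sns.histplot(data)
#     sns.rugplot(data, ax=ax)
#     ax.get_figure().savefig("output.png")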
def extract_all_users():
    print("[-] Extracting users")
    lst = read_csv_list("weighted_average.csv")[1:]
    set_user = set()
    for i in lst:
        set_user.add(i[0])
        set_user.add(i[1])
    return list(set_user), [x[0:3] for x in lst]
def plot_connection_graph():
    lst = read_csv_list("results_ip_comp.csv")[1:1000]
    lst = [x for x in lst if float(x[-1]) < 10000.0]
    links = pd.DataFrame({'source': [x[0] for x in lst],
                          'target': [x[1] for x in lst]})
    chord = hv.Chord(links).select(value=(5, None))
    chord.opts(
        opts.Chord(cmap='Category20', edge_cmap='Category20',
                   edge_color=dim('source').str(), labels='name',
                   node_color=dim('index').str()))
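# As written, plot_connection_graph() styles the chord diagram but never
# renders or saves it. With a holoviews rendering backend loaded (e.g. bokeh)
# the object can be written out; a hedged usage sketch:
#
#     import holoviews as hv
#     hv.extension('bokeh')
#     chord = ...  # the styled hv.Chord object from above
#     hv.save(chord, 'chord.html')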
def basic_solve_find_solutions(filepath):
    data = read_csv_list(filepath)
    lst = []
    # Score every unordered pair of users exactly once (O(n^2) comparisons)
    for i, row1 in enumerate(data):
        user1, values1 = row1[0], row1[1:]
        for row2 in data[i + 1:]:
            user2, values2 = row2[0], row2[1:]
            lst.append((user1, user2, basic_score(values1, values2)))
    return lst
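# basic_score() is not defined in this section. A hypothetical placeholder
# for testing, assuming a Jaccard-style overlap between the two feature
# lists (this is an illustration, not the project's actual metric):
def basic_score(values1, values2):
    s1, s2 = set(values1), set(values2)
    if not s1 and not s2:
        return 0.0
    return len(s1 & s2) / len(s1 | s2)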
def gen_new_dataset():
    global global_lst
    lst = read_csv_list("skype_files/user_to_skype.csv")[1:]
    print("Length of the Dataset: %d" % len(lst))
    #global_lst = sorted(lst, key=lambda x: len(x), reverse=True)
    return lst
def gen_ip_values():
    lst_ips = read_csv_list("ip_files/ip_count.csv")[1:]
    # Counts come out of the CSV as strings; sort them numerically
    lst_ips = sorted(lst_ips, key=lambda x: int(x[1]), reverse=True)
    print(lst_ips[:3])
    # Split the ranked IPs into 254 equally sized buckets, valued 1..254
    divisions = int(math.ceil(float(len(lst_ips)) / 254.0))
    dictio = {}
    for i in range(254):
        for elem in lst_ips[i * divisions:(i + 1) * divisions]:
            dictio[elem[0]] = i + 1
    print(dictio)
    return dictio
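# A quick worked example of the bucketing above, with hypothetical numbers:
# for 1000 ranked IPs, divisions = ceil(1000 / 254) = 4, so ranks 0-3 map to
# bucket value 1, ranks 4-7 to value 2, and the last IP (rank 999) to value
# 999 // 4 + 1 = 250. With fewer than 254 IPs each rank gets its own bucket.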
def get_most_important(lst_res2):
    # `lst_res2` was an undefined global in the original; it is assumed to be
    # the list of feature prefixes (the same ids used for the "<id>_files/"
    # directories), so it is taken as a parameter here.
    lst = read_csv_list("weighted_average.csv")[1:]
    dictio_lst = {}
    status.create_numbar(100, len(lst_res2))
    for indi, i in enumerate(lst_res2):
        status.update_numbar(indi, len(lst_res2))
        dictio_lst[i] = read_csv_list(i + "_files/user_to_" + i + ".csv")[1:]
    status.end_numbar()
    final_lst = []
    num = 100
    status.create_numbar(100, num)
    for indi, i in enumerate(lst[:num]):
        status.update_numbar(indi, num)
        userilist = [i[0], i[1], i[2]]
        for key in lst_res2:
            u1, u2 = find_user_list(i[0], i[1], dictio_lst[key])
            if u1 is None or u2 is None:
                continue
            userilist += list(u1.intersection(u2))
        final_lst.append(tuple(userilist))
    status.end_numbar()
    gen_csv_from_tuples("croos_val.csv",
                        ['user_a', 'user_b', 'metric', 'similar_vals'],
                        final_lst)
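# find_user_list() is not defined in this section. Judging from its use above
# (it returns two values whose set intersection is taken), a plausible sketch
# is to look each user up in the feature list and return their values as
# sets. This is an assumption about the helper, not its actual implementation:
def find_user_list(user_a, user_b, feature_lst):
    u1 = u2 = None
    for row in feature_lst:
        if row[0] == user_a:
            u1 = set(row[1:])
        elif row[0] == user_b:
            u2 = set(row[1:])
    return u1, u2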
def gen_skype_values():
    lst_skypes = read_csv_list("skype_files/skype_count.csv")[1:]
    print("Length Skype Count: %d" % len(lst_skypes))
    # Counts come out of the CSV as strings; sort them numerically
    lst_skypes = sorted(lst_skypes, key=lambda x: int(x[1]), reverse=True)
    print(lst_skypes[:3])
    # Same 254-bucket ranking scheme as gen_ip_values()
    divisions = int(math.ceil(float(len(lst_skypes)) / 254.0))
    dictio = {}
    for i in range(254):
        for elem in lst_skypes[i * divisions:(i + 1) * divisions]:
            dictio[elem[0]] = i + 1
    return dictio
def generate_graph():
    print("[-] Extracting data")
    lst = read_csv_list("weighted_average.csv")[1:]
    print("[-] Generating list")
    # Keep only pairs whose weight is below 1.0; store the weight as a float
    elist = [(x[0], x[1], float(x[2])) for x in lst if float(x[2]) < 1.0]
    print("[-] Generating graph")
    G = nx.Graph()
    G.add_weighted_edges_from(elist)
    print("[-] Writing GEXF")
    nx.write_gexf(G, "graph.gexf")
    return G
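# The exported graph can be opened directly in Gephi, or loaded back with
# networkx. A short usage sketch:
#
#     import networkx as nx
#     G = nx.read_gexf("graph.gexf")
#     print(G.number_of_nodes(), G.number_of_edges())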
def simplify():
    import adhoc_removal
    from functional import seq
    files = [
        'trigram_files/user_to_trigram.csv',
        'timestamp_files/user_to_timestamp.csv'
    ]
    keep_users = adhoc_removal.keep_users
    for file in files:
        a = read_csv_list(file)
        print("Initial length: %d" % len(a))
        a = seq(a).filter(lambda x: x[0] in keep_users) \
                  .filter(lambda x: len(x) > 1)
        a = [tuple(x) for x in a]
        print("Final length: %d" % len(a))
        gen_csv_from_tuples(file + '_simple', ['IdAuthor', 'Features'], a)
def get_joined_results(self, filename):
    dictio_of_results = {}
    lst_results = read_csv_list(filename)
    head = lst_results[0]
    lst_results = lst_results[1:]
    for entry in lst_results:
        user0, user1 = entry[0], entry[1]
        # Columns 2.. hold one score per feature prefix named in the header
        for indi, prefix in enumerate(head[2:]):
            if user0 not in dictio_of_results:
                dictio_of_results[user0] = dict()
            if user1 not in dictio_of_results[user0]:
                dictio_of_results[user0][user1] = dict()
            dictio_of_results[user0][user1][prefix] = float(entry[2 + indi])
    return dictio_of_results, lst_results
def gen_dictio_from_csv():
    global dictio_of_results
    lst = read_csv_list("combination.csv")
    headers, lst = lst[0][2:], lst[1:]
    status.create_numbar(100, len(lst))
    for indi, i in enumerate(lst):
        status.update_numbar(indi, len(lst))
        if i[0] not in dictio_of_results:
            dictio_of_results[i[0]] = {}
        if i[1] not in dictio_of_results[i[0]]:
            dictio_of_results[i[0]][i[1]] = {}
        for indj, j in enumerate(i[2:]):
            dictio_of_results[i[0]][i[1]][headers[indj]] = j
    status.end_numbar()
    return lst
def generate_directories_for_users():
    print("[>] Creating dir")
    create_dir("Author/")
    print("[>] Reading user csv list")
    lst_users = read_csv_list("weighted_average.csv")[1:]
    # Relies on weighted_average.csv being sorted by ascending score:
    # stop at the first entry with score >= 0.35
    ev_set = set()
    for entry in lst_users:
        if float(entry[2]) >= 0.35:
            break
        ev_set.add(entry[0])
        ev_set.add(entry[1])
    for ind, user in enumerate(ev_set):
        generate_user_dataset(user, ind, len(ev_set))
def user_removal_based_on_participation():
    # Variant of the cached version above, without pickling the result.
    print("[-] Starting user participation detection.")
    lst = read_csv_list('num_files/user_to_num.csv')[1:]
    users = [i[0] for i in lst]
    # Number of posts per user
    x = np.array([int(x[1]) for x in lst])
    # Characters per post, per user
    y = [np.array([int(y) for y in x[2:]]) for x in lst]
    # Average characters per post; keep one entry per user (None for users
    # with no posts) so the zip below stays aligned
    z = [i.mean() if len(i) > 0 else None for i in y]
    keep_users = set()
    limi = np.quantile(x, .50)
    limk = np.quantile([v for v in z if v is not None], .50)
    for user, i, k in zip(users, x, z):
        if i > limi or (k is not None and k > limk):
            keep_users.add(user)
    print('[+] Extracted all the user participations.')
    return keep_users
import numpy as np
from common_utils import read_csv_list
import matplotlib
matplotlib.use('Agg')  # headless backend; must be set before importing pyplot
import matplotlib.pyplot as plt
import seaborn as sns

print("Loading file data.")
lst = read_csv_list('num_files/user_to_num.csv')[1:]
print("Loaded file data.")
# Number of posts per user
x = np.array([int(x[1]) for x in lst])
# Characters per post, per user
y = [np.array([int(y) for y in x[2:]]) for x in lst]
print("Created axis")
# Average characters per post (users with no posts are skipped)
z = np.array([i.mean() for i in y if len(i) > 0])
# Percentile curves for both distributions
xg = [np.quantile(x, i) for i in np.arange(0, 1.01, 0.01)]
zg = [np.quantile(z, i) for i in np.arange(0, 1.01, 0.01)]
print("XG", xg)
print("ZG", zg)
sns.set(style="whitegrid")
ax = sns.distplot(x)
ax.figure.savefig("image.png")
def gen_latex_coincidences(do, specific_users=None):
    directory = "multfs_users/"
    lst_users = read_csv_list("multfs.csv")[1:]
    create_dir(directory)
    dictios_of_users = []
    value_inds = []
    user_inds = []
    file = open('analysis.tex', 'w+')
    header = """\\documentclass[12pt]{article}
\\usepackage[utf8]{inputenc}
\\usepackage[T1]{fontenc}
\\usepackage[USenglish]{babel}
\\usepackage{xcolor} % Colors
\\usepackage{tabularx} % Other type of columns
\\usepackage{caption}
\\usepackage{hyperref}
\\renewcommand{\\baselinestretch}{1.3}
\\usepackage{minted}
\\title{Found pairs of users}
\\author{-}
\\date{}
\\begin{document}
\\maketitle\n"""
    footer = """
\\end{document}"""
    file.write(header)
    # If no specific users are given, take the first 5 pairs; otherwise keep
    # only the pairs where both users appear in specific_users. (The original
    # used a mutable default of [] but tested against None, so the first-5
    # branch was unreachable, and matching pairs could be appended more than
    # once.)
    if not specific_users:
        pairs = lst_users[:5]
    else:
        pairs = []
        for tuple_list in lst_users:
            u1, u2 = tuple_list[0], tuple_list[1]
            if u1 in specific_users and u2 in specific_users:
                pairs.append(tuple_list)
    for _id in do:
        dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
        dictios_of_users.append(dictio_of_users)
        value_inds.append(unpickle_object(_id + "_files/clean_value_ind.pkl"))
        user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
        user_inds.append(user_ind)
        print("ID: %s" % _id, len(dictio_of_users), len(user_ind))
    for index, (u1, u2, _, _, _, _) in enumerate(pairs):
        print("Going for %d" % index, u1, u2)
        uname1, rg1, lp1 = get_username(u1)
        uname2, rg2, lp2 = get_username(u2)
        file.write("\\section{%s(%s)-%s(%s)} \n" % (u1, uname1, u2, uname2))
        coins = get_coincidences_for_pair(u1, u2, dictios_of_users,
                                          user_inds, value_inds)
        gen_latex_post_coincidences(coins, u1, u2, file)
    file.write(footer)
    file.close()
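# Note: the generated analysis.tex loads the minted package, which shells out
# to Pygments, so the document has to be compiled with shell escape enabled,
# e.g.:
#
#     pdflatex -shell-escape analysis.tex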
def get_dictio_from_file():
    lst = read_csv_list("word_index.csv")[1:]
    return {x[0]: int(x[1]) for x in lst}
def gen_data(self):
    tic = time.time()
    # Paths for the pickled data structures
    user_ind_p = self.dir + 'user_ind.pkl'
    value_ind_p = self.dir + 'value_ind.pkl'
    dictio_of_users_p = self.dir + 'dictio_of_users.pkl'
    dictio_of_values_p = self.dir + 'dictio_of_values.pkl'
    dictio_of_usage_p = self.dir + 'dictio_of_usage.pkl'
    # Register the files for cleanup
    self.cleanup_list.extend([user_ind_p, value_ind_p, dictio_of_users_p,
                              dictio_of_values_p, dictio_of_usage_p])
    if self.backup and all(os.path.exists(p) for p in
                           (user_ind_p, value_ind_p, dictio_of_users_p,
                            dictio_of_values_p, dictio_of_usage_p)):
        self.pprint("Data Structures already exist, unpickling.", end='\r')
        user_ind = unpickle_object(user_ind_p)
        value_ind = unpickle_object(value_ind_p)
        dictio_of_users = unpickle_object(dictio_of_users_p)
        dictio_of_values = unpickle_object(dictio_of_values_p)
        # TODO Remove comment
        #dictio_of_usage = unpickle_object(dictio_of_usage_p)
        dictio_of_usage = None
        self.pprint("[END] Data Structures already exist, unpickling.",
                    get_ram(), get_elapsed_time(tic))
        return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage
    lst = read_csv_list(self.data)[1:]
    tic = time.time()
    user_ind = {}          # user id -> compact uint32 index
    value_ind = {}         # feature value -> compact uint32 index
    dictio_of_users = {}   # user index -> list of value indices
    dictio_of_values = {}  # value index -> list of user indices
    dictio_of_usage = {}   # user index -> usage count per value
    total = len(lst)
    max_val = np.uint32(0)
    for uind, i in enumerate(lst):
        if uind % 1000 == 0:
            self.pprint("Data Structures Generation",
                        "[%d Users Processed]" % uind,
                        "[%0.3f Percentage]" % ((uind / total) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        uind = np.uint32(uind)
        user_ind[i[0]] = uind
        dictio_of_users[uind] = []
        dictio_of_usage[uind] = []
        for t in i[1:]:
            value, usage = self.separate(t)
            usage = np.uint32(usage)
            if value not in value_ind:
                value_ind[value] = max_val
                dictio_of_values[max_val] = []
                max_val += 1
            vind = value_ind[value]
            dictio_of_values[vind].append(uind)
            dictio_of_users[uind].append(vind)
            dictio_of_usage[uind].append(usage)
    self.pprint("[END] Data Structures Generation",
                "[%d Users Processed]" % uind,
                "[%0.3f Percentage]" % ((uind / total) * 100),
                get_ram(), get_elapsed_time(tic))
    lst = None  # Free the list, no longer needed
    # NOTE: the pickle_object() calls that would write the five structures to
    # disk are currently disabled, so the backup branch above only triggers if
    # a previous version of the code produced the files.
    return user_ind, value_ind, dictio_of_users, dictio_of_values, dictio_of_usage
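# A quick illustration of the structures gen_data() builds, assuming two
# input rows ("alice", "v1:3", "v2:1") and ("bob", "v2:4"), where separate()
# splits a "value:usage" field (uint32 wrappers omitted for readability):
#
#     user_ind         == {'alice': 0, 'bob': 1}
#     value_ind        == {'v1': 0, 'v2': 1}
#     dictio_of_users  == {0: [0, 1], 1: [1]}
#     dictio_of_values == {0: [0], 1: [0, 1]}
#     dictio_of_usage  == {0: [3, 1], 1: [4]}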
def generate_word_dictionary():
    print("[-] Generating dictionary")
    lst = read_csv_list("ind_users.csv")[1:]
    dictio = word_from_features(lst)
    store_dictio(dictio)
def clean_usernames():
    lst = read_csv_list("similar_usernames_full.csv")[1:]
    lst2 = read_csv_list("all_posts_all_users.csv")
    lst = [x for x in lst if x[0] != '']
    lst2 = [x for x in lst2 if x[0] != '']
    total = 0
    count = 0
    dictio = {}
    results = []
    threshold = 20
    # dictio: username -> {site id -> number of posts}
    for i in lst2:
        dictio[i[0]] = {}
    for i in lst2:
        dictio[i[0]][i[1]] = int(i[2])
    not_both = [x for x in lst if x[0] not in dictio]
    print(len(dictio), len(set(x[0] for x in lst)), len(not_both),
          len(not_both) + len(dictio))
    # Keep usernames whose post count reaches the threshold on every site;
    # a site missing from dictio is treated as 0 posts (the original would
    # raise a KeyError there)
    for i in lst:
        boolean = True
        for j in i[1:]:
            if i[0] in dictio and dictio[i[0]].get(j, 0) < threshold:
                boolean = False
        if boolean:
            results += [i]
        elif i[0] in dictio:
            del dictio[i[0]]
    for i in results:
        if len(i) > 3:
            count += 1
        if len(i) > 4:
            total += 1
    print("At least two: %d" % len(dictio), "At least three: %d" % count,
          "At least four: %d" % total)
    conn = psycopg2.connect(database="crimebb", user=db_username,
                            password=db_password, host="127.0.0.1", port="5432")
    print("Database Connected....")
    rows_processed = []
    status.create_numbar(100, len(dictio))
    for indi, i in enumerate(dictio.keys()):
        status.update_numbar(indi, len(dictio))
        for key in dictio[i].keys():
            cur = conn.cursor()
            cur.execute("""SELECT "Post"."Content" from "Post"
                JOIN "Member" ON "Post"."Author" = "Member"."IdMember"
                WHERE ("Member"."Username" = %s) AND "Member"."Site" = %s;""",
                        (i, int(key)))
            rows = [row[0] for row in cur.fetchall()]
            tfidf = tf_idf(rows)
            tfidf = sorted(tfidf, key=lambda x: x[-1], reverse=True)
            # Flatten the top-50 (term, score) entries into one tuple
            tfidf = [field for entry in tfidf[:50] for field in entry]
            dictio[i][key] = tuple(tfidf)
        if indi == 100:
            # Intermediate dump of the first 100 users for inspection
            k = list(dictio.keys())
            rows_processed = [(user, forum) + dictio[user][forum]
                              for user in k[:100]
                              for forum in dictio[user].keys()]
            gen_csv_from_tuples("tfidf_prov.csv", [""], rows_processed)
    status.end_numbar()
    rows_processed = [(user, forum) + dictio[user][forum]
                      for user in dictio.keys()
                      for forum in dictio[user].keys()]
    gen_csv_from_tuples("tfidf.csv", [""], rows_processed)
    conn.close()
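# tf_idf() is not shown in this section; judging from its use above, it
# returns (term, ..., score) tuples that are later sorted by their last
# field. A hypothetical stand-in built on scikit-learn (an assumption, not
# the project's actual helper):
def tf_idf(documents):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(documents)
    # Aggregate each term's score over all of the user's posts
    scores = matrix.sum(axis=0).A1
    terms = vectorizer.get_feature_names_out()
    return list(zip(terms, scores))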
def get_ind_features():
    lst = read_csv_list("ind_users.csv")[1:]
    return {x[0]: x[1:] for x in lst}
def gen_new_dataset():
    lst = read_csv_list("combination.csv")
    return lst
def generate_graph_pickle():
    lst = read_csv_list("weighted_average.csv")[1:]
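# generate_graph_pickle() is left unfinished above: it reads the CSV and
# stops. A hedged completion, assuming from its name and the commented-out
# nx.write_gpickle call in generate_graph() that the intent was to pickle
# the same weighted graph:
#
#     elist = [(x[0], x[1], float(x[2])) for x in lst if float(x[2]) < 1.0]
#     G = nx.Graph()
#     G.add_weighted_edges_from(elist)
#     nx.write_gpickle(G, "graph.pkl")  # removed in networkx 3.0; use pickle.dump there
#     return G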
def get_gen_fetures():  # (sic) name kept as-is to avoid breaking callers
    lst = read_csv_list("gen_users.csv")[1:]
    return {x[0] + "-" + x[1]: x[2:] for x in lst}