def do_extract_trigrams(members):
    """Count trigram occurrences across all users.

    For each user we extract all trigrams and accumulate how many times each
    one appears overall.  Progress is printed every 1000 users and a
    checkpoint CSV is written every 10000 users (and once more at the end).

    Parameters
    ----------
    members : sequence
        Users to process; each element is passed to
        ``get_trigrams_from_member``.

    Returns
    -------
    collections.Counter
        Mapping trigram -> occurrence count (dict subclass, drop-in
        compatible with the plain dict the original returned).
    """
    from collections import Counter  # local import: file import header not visible here
    dict_trigrams = Counter()  # Counter replaces the manual in/else bookkeeping
    tic = time.time()
    dirname = 'trigram_files/'
    create_dir(dirname)
    for i, member in enumerate(members):
        if i % 1000 == 0:
            print("[%d Users Processed]" % (i),
                  "[%d Trigrams]" % (len(dict_trigrams)),
                  "[%0.3f Percentage]" % ((i / len(members)) * 100),
                  get_ram(), get_elapsed_time(tic))
        if i % 10000 == 0:
            # Periodic checkpoint so a crash does not lose all progress.
            print("Saving data...")
            gen_csv_from_tuples(dirname + "trigrams.csv",
                                ["trigram", "usage"],
                                list(dict_trigrams.items()))
        dict_trigrams.update(get_trigrams_from_member(member))
    print("Saving data...")
    gen_csv_from_tuples(dirname + "trigrams.csv", ["trigram", "usage"],
                        list(dict_trigrams.items()))
    return dict_trigrams
def do_extract_trigrams_mp(members):
    """Multiprocessing variant of ``do_extract_trigrams``.

    Users are processed in batches of 10000; each batch is mapped over a
    12-worker pool and the per-user trigram lists are merged into one
    global count, which is saved once at the end.

    Parameters
    ----------
    members : sequence
        Users to process; each element is passed to
        ``get_trigrams_from_member`` in a worker process.

    Returns
    -------
    collections.Counter
        Mapping trigram -> occurrence count.
    """
    from collections import Counter  # local import: file import header not visible here
    dict_trigrams = Counter()
    tic = time.time()
    batch_size = 10000
    dirname = 'trigram_files/'
    create_dir(dirname)
    for i in range(0, len(members), batch_size):
        # Context manager guarantees the pool is torn down even if
        # get_trigrams_from_member raises (original only close()d on success).
        with mp.Pool(12) as p:
            list_trigrams = p.map(get_trigrams_from_member,
                                  members[i:i + batch_size])
        print("Trigrams", "[%d Users Processed]" % (i),
              "[%d Trigrams]" % (len(dict_trigrams)),
              "[%0.3f Percentage]" % ((i / len(members)) * 100),
              get_ram(), get_elapsed_time(tic))
        for trigrams in list_trigrams:
            dict_trigrams.update(trigrams)
    print("Saving data...")
    gen_csv_from_tuples(dirname + "trigrams.csv", ["trigram", "usage"],
                        list(dict_trigrams.items()))
    return dict_trigrams
def do_extract_users_trigrams_mp(members):
    """Map each user to their trigram ids, in parallel batches of 20000.

    Each batch is written to ``trigram_files/user_to_trigram.csv_<i>`` and
    all parts are joined into ``user_to_trigram.csv`` at the end.

    Parameters
    ----------
    members : sequence
        Users to process; each element is passed to
        ``do_extract_user_trigrams`` in a worker process.

    Returns
    -------
    dict
        NOTE(review): loading of the trigram index was disabled upstream
        (the read of trigrams.csv was commented out), so this dict stays
        empty and ``print(len(dict_trigrams))`` shows 0; kept as-is to
        preserve behavior.
    """
    dict_trigrams = dict()
    tic = time.time()
    batch_size = 20000
    dirname = 'trigram_files/'
    create_dir(dirname)
    print(len(dict_trigrams))
    # Work in batches so we do not fill memory; each batch is flushed to
    # its own CSV part immediately.
    for i in range(0, len(members), batch_size):
        print("Users Trigrams", "[%d Users Processed]" % (i),
              "[%0.3f Percentage]" % ((i / len(members)) * 100),
              get_ram(), get_elapsed_time(tic))
        batch = members[i:i + batch_size]
        # Context manager tears the pool down even if a worker raises
        # (original only close()d on the success path).
        with mp.Pool(16) as p:
            users_trigrams = p.map(do_extract_user_trigrams, batch)
        # We only store rows with more than one element.
        gen_csv_from_tuples(dirname + "user_to_trigram.csv_%d" % (i),
                            ["IdAuthor", "IdTrigrams"],
                            [x for x in users_trigrams if len(x) > 1])
    join_all_results(dirname + "user_to_trigram.csv")
    return dict_trigrams
def gen_coincidences(do):
    """Write post-coincidence reports for the top-5 pairs in multfs.csv.

    For every feature id in *do*, the cleaned per-feature indices are
    unpickled from ``<id>_files/``; then each candidate pair of users gets
    its own directory populated via ``gen_post_coincidences``.
    """
    out_root = "multfs_users/"
    rows = read_csv_list("multfs.csv")[1:]  # drop header row
    create_dir(out_root)
    per_feature_users = []
    per_feature_values = []
    per_feature_user_inds = []
    pairs = rows[:5]
    for feature_id in do:
        prefix = feature_id + "_files/clean_"
        users_map = unpickle_object(prefix + "dictio_of_users.pkl")
        per_feature_users.append(users_map)
        print(len(users_map))
        per_feature_values.append(unpickle_object(prefix + "value_ind.pkl"))
        per_feature_user_inds.append(unpickle_object(prefix + "user_ind.pkl"))
    for index, (u1, u2, _, _, _, _) in enumerate(pairs):
        print("Going for %d" % (index), u1, u2)
        uname1, _rg1, _lp1 = get_username(u1)
        uname2, _rg2, _lp2 = get_username(u2)
        pair_dir = out_root + "%s(%s)-%s(%s)/" % (u1, uname1, u2, uname2)
        create_dir(pair_dir)
        coins = get_coincidences_for_pair(u1, u2, per_feature_users,
                                          per_feature_user_inds,
                                          per_feature_values)
        print(coins)
        gen_post_coincidences(coins, u1, u2, pair_dir)
def run(self, context):
    """Prepare the Jinja environment and output folder, storing both in *context*."""
    self._init_script()
    # Templates live in the "sample" folder next to this script.
    template_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + "sample"
    context["jinja_env"] = Environment(
        loader=FileSystemLoader(template_dir),
        trim_blocks=True,
        autoescape=True,
    )
    output_folder = "generated"
    context["output_folder"] = output_folder
    create_dir(output_folder, delete_existing=False)
def create_directories_and_datasets():
    """Create every feature directory and dump its user-to-feature CSV."""
    # (directory, extractor, output csv) — one entry per feature type,
    # executed in the same order as before.
    jobs = (
        ('email_files', extract_user_to_email_csv, "email_files/user_to_email.csv"),
        ('btc_files', extract_user_to_btc_csv, "btc_files/user_to_btc.csv"),
        ('ip_files', extract_user_to_ip_csv, "ip_files/user_to_ip.csv"),
        ('skype_files', extract_user_to_skype_csv, "skype_files/user_to_skype.csv"),
        ('link_files', extract_user_to_link_csv, "link_files/user_to_link.csv"),
    )
    for dirname, extractor, outfile in jobs:
        create_dir(dirname)
        extractor(outfile)
    extract_user_trigrams()
def generate_directories_for_users():
    """Generate a post dataset for every user in pairs scoring below 0.35.

    NOTE(review): rows in weighted_average.csv appear to be sorted by score
    (column 2) — the scan stops at the first row at or above the threshold;
    confirm the file is score-ordered.
    """
    print("[>] Creating dir")
    create_dir("Author/")
    print("[>] Reading user csv list")
    rows = read_csv_list("weighted_average.csv")[1:]  # drop header
    selected = set()
    for row in rows:
        if float(row[2]) >= 0.35:
            break  # assumed score-ordered; nothing further qualifies
        selected.update((row[0], row[1]))
    for ind, user in enumerate(selected):
        generate_user_dataset(user, ind, len(selected))
def extract_chars_per_user(members):
    """Compute per-user metrics in parallel batches of 24000.

    Each batch is written to ``num_files/user_to_num.csv_<i>`` and the
    parts are joined into ``num_files/user_to_num.csv`` at the end.

    Parameters
    ----------
    members : sequence
        Users passed to ``extract_metrics_for_user`` in worker processes.
    """
    dirname = 'num_files/'
    filename = 'user_to_num.csv'
    tic = time.time()
    batch_size = 24000
    create_dir(dirname)
    for i in range(0, len(members), batch_size):
        # Context manager releases the workers even if a worker raises
        # (original only close()d on the success path).
        with mp.Pool(16) as p:
            lst_pr = p.map(extract_metrics_for_user, members[i:i + batch_size])
        print("Chars per User", "[%d Users Processed]" % (i),
              "[%0.3f Percentage]" % ((i / len(members)) * 100),
              get_ram(), get_elapsed_time(tic))
        # NOTE(review): the "IdTrigrams" header looks copy-pasted from the
        # trigram extractor while the rows come from extract_metrics_for_user;
        # kept unchanged to avoid breaking downstream consumers of the CSV.
        gen_csv_from_tuples(dirname + filename + "_%d" % (i),
                            ["IdAuthor", "IdTrigrams"], lst_pr)
    join_all_results(dirname + filename)
def do_extract_timestamps(members):
    """Extract timestamps for all users in chunks of 100000, then join the parts."""
    num_users = len(members)
    tic = time.time()
    print("Extracted %d users" % (num_users))
    dirname = 'timestamp_files/'
    create_dir(dirname)
    chunk = 100000
    # for/enumerate replaces the original hand-maintained while-loop counters;
    # part == old `i`, start == old `total_size`.
    for part, start in enumerate(range(0, num_users, chunk)):
        print("User Timestamps", "[%d Users Processed]" % (start),
              "[%0.3f Percentage]" % ((start / num_users) * 100),
              get_ram(), get_elapsed_time(tic))
        extract_timestamps(members[start:start + chunk],
                           dirname + "user_to_timestamp.csv_%d" % (part))
        print(part + 1, start + chunk, time.time() - tic)
    join_all_results(dirname + "user_to_timestamp.csv")
def generate_user_dataset(user, uind, total):
    """Dump every post of *user* to ``Author/<user>/<uid>-<site>-<post>.txt``.

    Parameters
    ----------
    user : str
        User key understood by ``get_user_site``.
    uind, total : int
        Progress indices used only for logging.

    Fixes vs. original: removed the unused ``tic`` timer and the unused
    ``enumerate`` index, merged the redundant re-materialization of ``rows``
    into one comprehension, and hoisted the loop-invariant filename prefix.
    """
    print("[-] Going for user %d/%d - %s" % (uind, total, user))
    user_id, site_id = get_user_site(user)
    print("[- -] Extracting from DB")
    # user_id/site_id come from get_user_site and are formatted with %d,
    # so this interpolation is injection-safe; prefer a parameterized query
    # if make_query ever supports it.
    query = """SELECT "IdPost", "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    rows = [(row[0], row[1]) for row in make_query(query)]
    print("[+ +] Done extracting from DB")
    directory = 'Author/' + user + "/"
    create_dir(directory)
    print("[- -] Generating files for user, total: %d" % (len(rows)))
    prefix = str(user_id) + "-" + str(site_id) + "-"  # loop invariant
    for post_id, content in rows:
        with open(directory + prefix + str(post_id) + ".txt", 'w+') as file:
            file.write(content)
    print("[+ +] Generating files for user, total: %d" % (len(rows)))
    print("[+] Going for user %d - %s" % (uind, user))
def create_directories_and_datasets_2():
    """Create only the link feature directory and extract its user-to-link CSV."""
    target_dir = 'link_files'
    create_dir(target_dir)
    extract_user_to_link_csv(target_dir + "/user_to_link.csv")
def gen_latex_coincidences(do, specific_users=None):
    """Write ``analysis.tex`` with one section per coinciding pair of users.

    Parameters
    ----------
    do : iterable of str
        Feature ids whose cleaned indices are unpickled from ``<id>_files/``.
    specific_users : list of str, optional
        When given, only pairs whose two users are both in this list are
        reported; otherwise the first five pairs from multfs.csv are used.

    Fixes vs. original:
    * the default was a mutable ``[]`` compared against ``None``, so the
      documented "take the first 5 pairs" branch was unreachable with the
      default call and ``pairs`` came out empty;
    * a pair with both users in *specific_users* was appended once per
      matching user by the inner loop, duplicating sections;
    * the output file is now managed with ``with`` so it is closed even if
      a helper raises.
    """
    directory = "multfs_users/"
    lst_users = read_csv_list("multfs.csv")[1:]
    create_dir(directory)
    dictios_of_users = []
    value_inds = []
    user_inds = []
    header = """\\documentclass[12pt]{article}
\\usepackage[utf8]{inputenc}
\\usepackage[T1]{fontenc}
\\usepackage[USenglish]{babel}
\\usepackage{xcolor} % Colors
\\usepackage{tabularx} % Other type of columns
\\usepackage{caption}
\\usepackage{hyperref}
\\renewcommand{\\baselinestretch}{1.3}
\\usepackage{minted}
\\title{Found pairs of users}
\\author{-}
\\date{}
\\begin{document}
\\maketitle\n"""
    footer = """ \\end{document}"""
    # If no specific users are given, take the first 5 pairs; otherwise keep
    # each pair exactly once when both of its users were requested.
    if not specific_users:
        pairs = lst_users[:5]
    else:
        wanted = set(specific_users)
        pairs = [row for row in lst_users if row[0] in wanted and row[1] in wanted]
    for _id in do:
        dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
        dictios_of_users.append(dictio_of_users)
        value_inds.append(unpickle_object(_id + "_files/clean_value_ind.pkl"))
        user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
        user_inds.append(user_ind)
        print("ID: %s" % (_id), len(dictio_of_users), len(user_ind))
    with open('analysis.tex', 'w+') as file:
        file.write(header)
        for index, (u1, u2, _, _, _, _) in enumerate(pairs):
            print("Going for %d" % (index), u1, u2)
            uname1, rg1, lp1 = get_username(u1)
            uname2, rg2, lp2 = get_username(u2)
            file.write("\\section{%s(%s)-%s(%s)} \n" % (u1, uname1, u2, uname2))
            coins = get_coincidences_for_pair(u1, u2, dictios_of_users,
                                              user_inds, value_inds)
            gen_latex_post_coincidences(coins, u1, u2, file)
        file.write(footer)