Example #1
def do_extract_trigrams(members):
    # Extract trigrams from every user: for each user, collect all of their
    # trigrams and tally them in dict_trigrams, which maps each trigram to
    # the number of times it has appeared so far.

    dict_trigrams = dict()
    tic = time.time()
    dirname = 'trigram_files/'
    create_dir(dirname)
    for i, member in enumerate(members):
        if i % 1000 == 0:
            print("[%d Users Processed]" % (i),
                  "[%d Trigrams]" % (len(dict_trigrams)),
                  "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
                  get_elapsed_time(tic))
            # Checkpoint the running counts every 10,000 users.
            if i % 10000 == 0:
                print("Saving data...")
                gen_csv_from_tuples(dirname + "trigrams.csv",
                                    ["trigram", "usage"],
                                    [(k, v) for k, v in dict_trigrams.items()])

        for elem in get_trigrams_from_member(member):
            if elem in dict_trigrams:
                dict_trigrams[elem] += 1
            else:
                dict_trigrams[elem] = 1
    print("Saving data...")
    gen_csv_from_tuples(dirname + "trigrams.csv", ["trigram", "usage"],
                        [(k, v) for k, v in dict_trigrams.items()])
    return dict_trigrams
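
The if/else tally above is the classic manual counting pattern; it is equivalent to updating a collections.Counter. A minimal sketch, assuming get_trigrams_from_member returns an iterable of trigram strings:

from collections import Counter

# Equivalent tally using Counter: update() adds one count per element.
dict_trigrams = Counter()
for member in members:
    dict_trigrams.update(get_trigrams_from_member(member))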
Example #2
def do_extract_trigrams_mp(members):
    dict_trigrams = dict()
    tic = time.time()
    batch_size = 10000
    dirname = 'trigram_files/'
    create_dir(dirname)
    for i in range(0, len(members), batch_size):
        p = mp.Pool(12)
        list_trigrams = p.map(get_trigrams_from_member,
                              members[i:i + batch_size])
        print("Trigrams", "[%d Users Processed]" % (i),
              "[%d Trigrams]" % (len(dict_trigrams)),
              "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
              get_elapsed_time(tic))

        for l in list_trigrams:
            for elem in l:
                if elem in dict_trigrams:
                    dict_trigrams[elem] += 1
                else:
                    dict_trigrams[elem] = 1
        p.close()
    print("Saving data...")
    gen_csv_from_tuples(dirname + "trigrams.csv", ["trigram", "usage"],
                        [(k, v) for k, v in dict_trigrams.items()])
    return dict_trigrams
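
Note that each batch builds a fresh pool and calls p.close() without a matching p.join(). On Python 3.3+ a context-managed pool keeps the lifecycle explicit; a sketch of the batch step under that pattern:

import multiprocessing as mp

with mp.Pool(12) as p:
    # map() blocks until all workers have finished this batch
    list_trigrams = p.map(get_trigrams_from_member, members[i:i + batch_size])
# the pool is terminated automatically when the with-block exits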
Example #3
def do_extract_users_trigrams_mp(members):
    dict_trigrams = dict()
    tic = time.time()
    batch_size = 20000
    dirname = 'trigram_files/'
    create_dir(dirname)

    # Assign an index to each trigram in the order it is stored in the file,
    # dropping trigrams used by only one user.
    #dict_trigrams = {x[0]:i for i, x in enumerate(read_csv_list(dirname+"trigrams.csv")[1:]) if int(x[1]) > 1}
    print(len(dict_trigrams))
    # Process the members in batches so that memory usage stays bounded;
    # each batch is written to disk as it completes.
    #partial_function = partial(do_extract_user_trigrams, dict_trigrams)
    for i in range(0, len(members), batch_size):
        print("Users Trigrams", "[%d Users Processed]" % (i),
              "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
              get_elapsed_time(tic))
        batch = members[i:i + batch_size]
        p = mp.Pool(16)
        users_trigrams = p.map(do_extract_user_trigrams, batch)
        p.close()
        # Only store users that have at least one trigram (tuples longer
        # than just the author id).
        gen_csv_from_tuples(dirname + "user_to_trigram.csv_%d" % (i),
                            ["IdAuthor", "IdTrigrams"],
                            [x for x in users_trigrams if len(x) > 1])

    #fs = FeatureScore(None, None, None, None, None, None, None,None, None,None, None)
    join_all_results(dirname + "user_to_trigram.csv")
    return dict_trigrams
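
The commented-out line near the top shows how the trigram-to-index map could be rebuilt from trigrams.csv; a standalone version of the same idea, assuming read_csv_list returns (trigram, usage) rows after a header row:

# Rebuild the trigram index from the counts file produced earlier.
rows = read_csv_list(dirname + "trigrams.csv")[1:]  # skip the header row
dict_trigrams = {row[0]: i                  # trigram -> index in file order
                 for i, row in enumerate(rows)
                 if int(row[1]) > 1}        # drop trigrams with a single use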
Example #4
def gen_coincidences(do):
	directory = "multfs_users/"
	lst_users = read_csv_list("multfs.csv")[1:]
	create_dir(directory)
	dictios_of_users = []
	value_inds = []
	user_inds = []
	
	pairs = lst_users[:5]  # only the first five pairs are analyzed

	for _id in do:
		dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
		dictios_of_users.append(dictio_of_users)
		print(len(dictio_of_users))
		value_ind = unpickle_object(_id + "_files/clean_value_ind.pkl")
		value_inds.append(value_ind)
		user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
		user_inds.append(user_ind)

	for index, (u1, u2, _, _, _, _) in enumerate(pairs):
		print("Going for %d" % (index), u1, u2)
		uname1, rg1, lp1 = get_username(u1)
		uname2, rg2, lp2 = get_username(u2)
		directory2 = directory + "%s(%s)-%s(%s)/" %(u1, uname1, u2, uname2)
		create_dir(directory2)
		coins = get_coincidences_for_pair(u1, u2, dictios_of_users, user_inds, value_inds)
		print(coins)
		gen_post_coincidences(coins, u1, u2, directory2)
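
Judging from the directory names used in Example #6, do appears to be a list of feature identifiers whose cleaned pickles live under <id>_files/. A hypothetical call (the identifiers are illustrative, not confirmed by the source):

gen_coincidences(["email", "btc", "ip", "skype", "link"])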
Example #5
def run(self, context):
    self._init_script()
    template_folder = "sample"
    template_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), template_folder)
    jinja_env = Environment(loader=FileSystemLoader(template_dir),
                            trim_blocks=True,
                            autoescape=True)
    context["jinja_env"] = jinja_env
    context["output_folder"] = output_folder = "generated"
    create_dir(output_folder, delete_existing=False)
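
A minimal sketch of how a later step might consume the environment stored in context; the template name report.j2 and output name report.txt are hypothetical, not part of the original code:

# Render a template from the configured Jinja2 environment into the output folder.
template = context["jinja_env"].get_template("report.j2")
with open(os.path.join(context["output_folder"], "report.txt"), "w") as f:
    f.write(template.render(context))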
Example #6
def create_directories_and_datasets():
    create_dir('email_files')
    extract_user_to_email_csv("email_files/user_to_email.csv")
    create_dir('btc_files')
    extract_user_to_btc_csv("btc_files/user_to_btc.csv")
    create_dir('ip_files')
    extract_user_to_ip_csv("ip_files/user_to_ip.csv")
    create_dir('skype_files')
    extract_user_to_skype_csv("skype_files/user_to_skype.csv")
    create_dir('link_files')
    extract_user_to_link_csv("link_files/user_to_link.csv")
    extract_user_trigrams()
Example #7
def generate_directories_for_users():
    print("[>] Creating dir")
    create_dir("Author/")
    print("[>] Reading user csv list")
    lst_users = read_csv_list("weighted_average.csv")[1:]

    #lst_users = [(x[0], x[1]) for x in lst_users if float(x[2]) < 0.35]
    ev_set = set()
    # The list is assumed to be sorted by score, so we can stop at the first
    # entry at or above the 0.35 threshold (see the sketch after this example).
    for entry in lst_users:
        if float(entry[2]) >= 0.35:
            break
        ev_set.add(entry[0])
        ev_set.add(entry[1])
    #status.create_numbar(100, len(ev_set))
    for ind, user in enumerate(ev_set):
        #status.update_numbar(ind, len(ev_set))
        generate_user_dataset(user, ind, len(ev_set))
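
The early break above relies on weighted_average.csv being sorted ascending by its score column. If that ordering is not guaranteed, a filter over the whole list is equivalent; a sketch:

# Collect both users from every pair scoring below the 0.35 threshold,
# without assuming the list is sorted.
ev_set = {user
          for entry in lst_users if float(entry[2]) < 0.35
          for user in (entry[0], entry[1])}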
Example #8
def extract_chars_per_user(members):
    dirname = 'num_files/'
    filename = 'user_to_num.csv'
    tic = time.time()
    batch_size = 24000
    create_dir(dirname)
    for i in range(0, len(members), batch_size):
        p = mp.Pool(16)
        lst_pr = p.map(extract_metrics_for_user, members[i:i + batch_size])
        print("Chars per User", "[%d Users Processed]" % (i),
              "[%0.3f Percentage]" % ((i / len(members)) * 100), get_ram(),
              get_elapsed_time(tic))
        p.close()
        # Store the per-user metrics computed for this batch.
        gen_csv_from_tuples(dirname + filename + "_%d" % (i),
                            ["IdAuthor", "IdTrigrams"], lst_pr)

    join_all_results(dirname + filename)
Example #9
def do_extract_timestamps(members):
    num_users = len(members)
    tic = time.time()
    print("Extracted %d users" % (num_users))
    dirname = 'timestamp_files/'
    create_dir(dirname)
    divs = 100000  # users per output file
    i = 0
    total_size = 0
    while total_size < num_users:
        print("User Timestamps", "[%d Users Processed]" % (total_size),
              "[%0.3f Percentage]" % ((total_size / len(members)) * 100),
              get_ram(), get_elapsed_time(tic))
        extract_timestamps(members[total_size:total_size + divs],
                           dirname + "user_to_timestamp.csv_%d" % (i))
        i += 1
        total_size += divs
        print(i, total_size, time.time() - tic)
    join_all_results(dirname + "user_to_timestamp.csv")
Example #10
def generate_user_dataset(user, uind, total):
    print("[-] Going for user %d/%d - %s" % (uind, total, user))
    tic = time.time()
    user_id, site_id = get_user_site(user)
    print("[- -] Extracting from DB")
    query = """SELECT "IdPost", "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    rows = make_query(query)
    rows = [(row[0], row[1]) for row in rows]
    print("[+ +] Done extracting from DB")
    #a = string.ascii_lowercase
    #b = math.ceil(float(len(rows)) / float(len(a)) )
    #names = ["".join(elem) for iter in [it.product(a, repeat=i) for  i in range(1,b + 1)] for elem in iter]
    directory = 'Author/' + user + "/"
    create_dir(directory)
    print("[- -] Generating files for user, total: %d" % (len(rows)))
    for ind, content in enumerate(rows):
        filename = str(user_id) + "-" + str(site_id) + "-" + str(content[0])
        with open(directory + filename + ".txt", 'w+') as file:
            file.write(content[1])
    print("[+ +] Generating files for user, total: %d" % (len(rows)))
    print("[+] Going for user %d - %s" % (uind, user))
Example #11
def create_directories_and_datasets_2():
    create_dir('link_files')
    extract_user_to_link_csv("link_files/user_to_link.csv")
Example #12
def gen_latex_coincidences(do,specific_users=[]):
	directory = "multfs_users/"
	lst_users = read_csv_list("multfs.csv")[1:]
	create_dir(directory)
	dictios_of_users = []
	value_inds = []
	user_inds = []
	file = open('analysis.tex', 'w+')
	header = """\\documentclass[12pt]{article}

\\usepackage[utf8]{inputenc}
\\usepackage[T1]{fontenc}
\\usepackage[USenglish]{babel}

\\usepackage{xcolor} % Colors
\\usepackage{tabularx} % Other type of columns
\\usepackage{caption} 
\\usepackage{hyperref}
\\renewcommand{\\baselinestretch}{1.3}


\\usepackage{minted}

\\title{Found pairs of users}
\\author{-}
\\date{}
\\begin{document}

\\maketitle\n"""

	footer = """
\\end{document}"""

	file.write(header)
	# If no specific users are given, take the first five pairs; otherwise
	# keep only the pairs whose two users both appear in specific_users.
	if not specific_users:
		pairs = lst_users[:5]
	else:
		pairs = []
		for tuple_list in lst_users:
			(u1, u2, _, _, _, _) = tuple_list
			if u1 in specific_users and u2 in specific_users:
				pairs.append(tuple_list)

	for _id in do:

		dictio_of_users = unpickle_object(_id + "_files/clean_dictio_of_users.pkl")
		dictios_of_users.append(dictio_of_users)
		
		value_ind = unpickle_object(_id + "_files/clean_value_ind.pkl")
		value_inds.append(value_ind)
		user_ind = unpickle_object(_id + "_files/clean_user_ind.pkl")
		user_inds.append(user_ind)
		print("ID: %s" %(_id), len(dictio_of_users), len(user_ind))

	for index, (u1, u2, _, _, _, _) in enumerate(pairs):
		#file1name = "tex/%d.tex"%(index)
		#file1 = open(file1name, 'w+')
		print("Going for %d" % (index), u1, u2)
		uname1, rg1, lp1 = get_username(u1) 
		uname2, rg2, lp2 = get_username(u2)
		file.write("\\section{%s(%s)-%s(%s)} \n" %(u1, uname1, u2, uname2))
		#file.write("\\include{%s}\n" % (file1name))
		coins = get_coincidences_for_pair(u1, u2, dictios_of_users, user_inds, value_inds)
		#print("COINCIDENCES", coins)
		gen_latex_post_coincidences(coins, u1, u2, file)

	file.write(footer)
	file.close()
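
Because the preamble loads the minted package, compiling the generated analysis.tex requires shell escape (e.g. pdflatex -shell-escape analysis.tex). A hypothetical call, with illustrative feature and user identifiers:

# Restrict the report to pairs whose two users both appear in specific_users.
gen_latex_coincidences(["email", "btc"], specific_users=["12345", "67890"])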