Ejemplo n.º 1
0
def get_num_threads(user, site):
	if DEBUG:
		print("[-] Getting num_threads")
	#Posts created by user
	query1 = """SELECT count("IdThread") FROM "Thread" WHERE "Thread"."Author" = %d AND "Thread"."Site" = %d;""" % (user, site)
	#Posts he participated in
	query2 = """WITH A AS 
	(SELECT "Post"."Thread" FROM "Post" 
	WHERE "Post"."Author" = %d AND "Post"."Site" = %d 
	GROUP BY "Post"."Thread") 
	SELECT count(A."Thread") FROM A;""" % (user, site)
	rows1 = make_query(query1)
	rows2 = make_query(query2)
	#print(rows1, rows2)
	return rows1[0][0], rows2[0][0]
Ejemplo n.º 2
0
def extract_user_to_btc_csv(filename):
    tic = time.time()
    print("[+] Generating Bitcoin dataset")
    query = """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
  		regexp_matches( "Content", '\y([13][a-km-zA-HJ-NP-Z1-9]{25,34})\y', 'g') AS "btc"
  		FROM "Post" WHERE "Content" ~ '([13][a-km-zA-HJ-NP-Z1-9]{25,34})'),
		"B" AS (SELECT "Author", "btc"[1] as "btc", count(*) as "repetitions" FROM "A" GROUP BY "Author", "btc" )
		SELECT "B"."Author",
		string_agg(CAST("B"."btc" AS text) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""

    rows = make_query(query)
    rows = [
        list(row[:1] + tuple([x for x in row[1].split(", ")], ))
        for row in rows if row[0] != str(-1)
    ]
    for row in range(len(rows)):
        for col in range(1, len(rows[row])):
            if rows[row][col][-1] == '.':
                rows[row][col] = rows[row][col][:-1]

    for row in range(len(rows)):
        rows[row] = (rows[row][0], ) + tuple(set(rows[row][1:]))
    gen_csv_from_tuples(filename, ["IdAuthor", "btc"], rows)
    print("[+] Finished generating Bitcoin dataset in %d seconds" %
          (time.time() - tic))
Ejemplo n.º 3
0
def extract_user_to_ip_csv():
    query = """WITH "A" AS (SELECT
		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author", 
		regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
		FROM "Post"	WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
		"B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip" )
		SELECT "B"."Author",
		string_agg(CAST("B"."repetitions" AS text) || ':' || "B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4], ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    query = """WITH "A" AS (SELECT
		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
		regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
		FROM "Post"	WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
		"B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip" )
		SELECT "B"."Author",
		string_agg("B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4], ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    #print(rows)
    print(len(rows))
    rows = [
        row[:1] + tuple([x for x in row[1].split(", ")], ) for row in rows
        if row[0] != -1
    ]
    print(len(rows))
    gen_csv_from_tuples("ip_files/user_to_ip.csv", ["IdAuthor", "IP"], rows)
Ejemplo n.º 4
0
def extract_user_to_email_csv(filename):
    tic = time.time()
    print("[+] Generating Email dataset")
    query = """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
  		regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
  		FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
		"B" AS (SELECT "Author", lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "Author", "email" )
		SELECT "B"."Author",
		string_agg(CAST("B"."email" AS text) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [
        list(row[:1] + tuple([x for x in row[1].split(", ")], ))
        for row in rows if row[0] != str(-1)
    ]
    for row in range(len(rows)):
        for col in range(1, len(rows[row])):
            if len(rows[row][col]) > len("***LINK***") and rows[row][
                    col][:len("***LINK***")] == "***LINK***":
                rows[row][col] = rows[row][col][len("***LINK***"):]

    for row in range(len(rows)):
        rows[row] = (rows[row][0], ) + tuple(set(rows[row][1:]))
    gen_csv_from_tuples(filename, ["IdAuthor", "email"], rows)
    print("[+] Finished generating Email dataset in %d seconds" %
          (time.time() - tic))
Ejemplo n.º 5
0
def extract_user_to_ip_csv(filename):
    tic = time.time()
    print("[+] Generating IP Address dataset")
    query = """WITH "A" AS (SELECT
		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
		-- regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
		regexp_matches( "Content", '([0-9]+25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)\.([0-9]+25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)\.([0-9]+25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)\.(25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)', 'g') AS "ip"
		FROM "Post"	WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
		"B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip" )
		SELECT "B"."Author",
		string_agg("B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4] || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [
        row[:1] + tuple([x for x in row[1].split(", ")], ) for row in rows
        if row[0] != str(-1)
    ]
    cleaned_rows = []
    for r in rows:
        cleaned = [r[0]]
        for ip in r[1:]:
            if check_ip(ip.split('[')[0]):
                cleaned.append(ip)
        cleaned_rows.append(tuple(cleaned))

    gen_csv_from_tuples(filename, ["IdAuthor", "IP"], cleaned_rows)
    print("[+] Finished generating IP Address dataset in %d seconds" %
          (time.time() - tic))
Ejemplo n.º 6
0
def extract_user_to_skype_csv():
    query = """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author", 
  		regexp_matches( "Content", 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})', 'g') AS "skype"
  		FROM "Post" WHERE "Content" ~ 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})'),
		"B" AS (SELECT "Author", lower("skype"[1]) as "skype", count(*) as "repetitions" FROM "A" GROUP BY "Author", "skype" )
		SELECT "B"."Author",
		string_agg("B"."skype", ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    #rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
    rows = [
        list(row[:1] + tuple([x for x in row[1].split(", ")], ))
        for row in rows if row[0] != -1
    ]
    #print(rows)
    for row in range(len(rows)):
        #print(type(rows[row]), len(rows[row]))
        for col in range(1, len(rows[row])):
            #print(row, col, rows[row][col])
            if rows[row][col][-1] == '.':
                #print("Changed: %s by %s" % (rows[row][col], rows[row][col][10:]))
                rows[row][col] = rows[row][col][:-1]

    for row in range(len(rows)):
        rows[row] = (rows[row][0], ) + tuple(set(rows[row][1:]))
    print(len(rows))

    print(len(rows))
    gen_csv_from_tuples("skype_files/user_to_skype.csv", ["IdAuthor", "skype"],
                        rows)
Ejemplo n.º 7
0
def extract_user_to_skype_csv(filename):
    tic = time.time()
    print("[+] Generating Skype dataset")
    query = """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author", 
  		regexp_matches( "Content", 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})', 'g') AS "skype"
  		FROM "Post" WHERE "Content" ~ 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})'),
		"B" AS (SELECT "Author", lower("skype"[1]) as "skype", count(*) as "repetitions" FROM "A" GROUP BY "Author", "skype" )
		SELECT "B"."Author",
		string_agg(CAST("B"."skype" AS text) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [
        list(row[:1] + tuple([x for x in row[1].split(", ")], ))
        for row in rows if row[0] != str(-1)
    ]
    for row in range(len(rows)):
        for col in range(1, len(rows[row])):
            if rows[row][col][-1] == '.':
                rows[row][col] = rows[row][col][:-1]

    for row in range(len(rows)):
        rows[row] = (rows[row][0], ) + tuple(set(rows[row][1:]))

    gen_csv_from_tuples(filename, ["IdAuthor", "skype"], rows)
    print("[+] Finished generating Skype dataset in %d seconds" %
          (time.time() - tic))
Ejemplo n.º 8
0
def extract_user_to_link_csv():
	query= """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
  		regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
  		FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
		"B" AS (SELECT "Author", lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "Author", "link" )
		SELECT "B"."Author",
		string_agg("B"."link", ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""	
	rows = make_query(query)
	#rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
	rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
	# #print(rows)
	# for row in range(len(rows)):
	# 	#print(type(rows[row]), len(rows[row]))
	# 	for col in range(1, len(rows[row])):
	# 		#print(row, col, rows[row][col])
	# 		if rows[row][col][-1] == '.':
	# 			#print("Changed: %s by %s" % (rows[row][col], rows[row][col][10:]))
	# 			print("CHANGED")
	# 			rows[row][col] = rows[row][col][:-1]
				

	# for row in range(len(rows)):
	# 	rows[row] = (rows[row][0],) + tuple(set(rows[row][1:]))
	print (len(rows))
	gen_csv_from_tuples("link_files/user_to_link3.csv", ["IdAuthor", "link"], rows)
Ejemplo n.º 9
0
def extract_user_to_email_csv():
    query = """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
  		regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
  		FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
		"B" AS (SELECT "Author", lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "Author", "email" )
		SELECT "B"."Author",
		string_agg("B"."email", ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [
        list(row[:1] + tuple([x for x in row[1].split(", ")], ))
        for row in rows if row[0] != -1
    ]
    #print(rows)
    for row in range(len(rows)):
        #print(type(rows[row]), len(rows[row]))
        for col in range(1, len(rows[row])):
            #print(row, col, rows[row][col])
            if len(rows[row][col]) > len("***LINK***") and rows[row][
                    col][:len("***LINK***")] == "***LINK***":
                #print("Changed: %s by %s" % (rows[row][col], rows[row][col][10:]))
                rows[row][col] = rows[row][col][len("***LINK***"):]

    for row in range(len(rows)):
        rows[row] = (rows[row][0], ) + tuple(set(rows[row][1:]))
    #print(rows)

    #print(rows)
    print(len(rows))

    print(len(rows))
    gen_csv_from_tuples("email_files/user_to_email.csv", ["IdAuthor", "email"],
                        rows)
Ejemplo n.º 10
0
def extract_more_features():

    # Get all user different users
    query = """ SELECT "IdMember", "Site" FROM "Member" WHERE "IdMember" != -1;"""
    members = make_query(query)
    members = [(int(x[0]), int(x[1])) for x in members]

    do_extract_timestamps(members)
Ejemplo n.º 11
0
def get_post_content(user, site):
	if DEBUG:
		print("[-] Getting post content %d, %d" % (user, site))
	query = """SELECT "Post"."Content" from "Post" 
					WHERE "Post"."Author" = %d AND "Post"."Site" = %d;""" % (user, site)
	rows = make_query(query)
	rows = [tup[0] for tup in rows]
	return rows
Ejemplo n.º 12
0
def get_user_posts(user):
	pos_bracket = user.find("[")
	user_id = int(user[:pos_bracket])
	site_id = int(user[pos_bracket + 1:-1])
	#print("[- -] Extracting from DB")
	query = """SELECT "IdPost", "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (user_id, site_id)
	rows = make_query(query)
	rows = [(row[0], row[1]) for row in rows]
	return rows
Ejemplo n.º 13
0
def get_user_posts(user, connector=None):
    user_id = user[0]
    site_id = user[1]
    #print("[- -] Extracting from DB")
    #query = """SELECT "IdPost", "Content", "Timestamp" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (user_id, site_id)
    query = """SELECT  "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    rows = make_query(query, conn=connector)
    rows = [row[0] for row in rows]
    return rows
Ejemplo n.º 14
0
def extract_link_to_usage():
	query= """WITH "A" AS (SELECT
  		regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
  		FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
		"B" AS (SELECT lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "link" )
		SELECT "link", "repetitions" FROM "B";"""
	rows = make_query(query)

	print (len(rows))
	gen_csv_from_tuples("link_files/link_count.csv", ["link", "Reps"], rows)	
Ejemplo n.º 15
0
def get_username(user):
	pos_bracket = user.find("[")
	user_id = int(user[:pos_bracket])
	site_id = int(user[pos_bracket + 1:-1])
	#print("[- -] Extracting from DB")
	query = """SELECT "Username", "RegistrationDate", "LastPostDate" FROM "Member" WHERE "IdMember" = %d AND "Site" = %d;""" % (user_id, site_id)
	rows = make_query(query)
	username = rows[0][0]
	regdate = rows[0][1]
	lastpostdate = rows[0][2]
	return username, regdate, lastpostdate
Ejemplo n.º 16
0
def extract_ip_to_usage():
    query = """WITH "A" AS (SELECT
		regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
		FROM "Post"	WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
		"B" AS (SELECT "ip", count(*) as "repetitions" FROM "A" GROUP BY "ip" )
		SELECT "B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4] as "ip", "B"."repetitions" FROM "B";"""
    rows = make_query(query)
    #print(rows)
    print(len(rows))
    #rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
    print(len(rows))
    gen_csv_from_tuples("ip_files/ip_count.csv", ["IP", "Reps"], rows)
Ejemplo n.º 17
0
def extract_skype_to_usage():
    query = """WITH "A" AS (SELECT
  		regexp_matches( "Content", 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})', 'g') AS "skype"
  		FROM "Post" WHERE "Content" ~ 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})'),
		"B" AS (SELECT lower("skype"[1]) as "skype", count(*) as "repetitions" FROM "A" GROUP BY "skype" )
		SELECT "skype", "repetitions" FROM "B";"""
    rows = make_query(query)
    rows = [list(row) for row in rows]
    for i in range(len(rows)):
        if rows[i][0][-1] == '.':
            rows[i][0] = rows[i][0][:-1]
    rows = [tuple(row) for row in rows]
    #print(rows)
    print(len(rows))
    #rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
    print(len(rows))
    gen_csv_from_tuples("skype_files/skype_count.csv", ["skype", "Reps"], rows)
Ejemplo n.º 18
0
def extract_btc_to_usage():
    query = """WITH "A" AS (SELECT
		regexp_matches( "Content", '([13][a-km-zA-HJ-NP-Z1-9]{25,34})', 'g') AS "btc"
  		FROM "Post" WHERE "Content" ~ '([13][a-km-zA-HJ-NP-Z1-9]{25,34})'),
		"B" AS (SELECT lower("btc"[1]) as "btc", count(*) as "repetitions" FROM "A" GROUP BY "btc" )
		SELECT "btc", "repetitions" FROM "B";"""
    rows = make_query(query)
    rows = [list(row) for row in rows]
    for i in range(len(rows)):
        if rows[i][0][-1] == '.':
            rows[i][0] = rows[i][0][:-1]
    rows = [tuple(row) for row in rows]
    #print(rows)
    print(len(rows))
    #rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
    print(len(rows))
    gen_csv_from_tuples("btc_files/btc_count.csv", ["btc", "Reps"], rows)
Ejemplo n.º 19
0
def extract_email_to_usage():
    query = """WITH "A" AS (SELECT
  		regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
  		FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
		"B" AS (SELECT lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "email" )
		SELECT "email", "repetitions" FROM "B";"""
    rows = make_query(query)
    rows = [list(row) for row in rows]
    for i in range(len(rows)):
        if len(rows[i][0]) > len("***LINK***") and rows[i][
                0][:len("***LINK***")] == "***LINK***":
            #print("Changed: %s by %s" % (rows[row][col], rows[row][col][10:]))
            rows[i][0] = rows[i][0][len("***LINK***"):]
    rows = [tuple(row) for row in rows]
    #print(rows)
    print(len(rows))
    #rows = [row[:1] + tuple([x for x in row[1].split(", ")],) for row in rows if row[0] != -1]
    print(len(rows))
    gen_csv_from_tuples("email_files/email_count.csv", ["email", "Reps"], rows)
Ejemplo n.º 20
0
def extract_user_to_link_csv(filename):
    tic = time.time()
    print("[+] Generating Link dataset")
    query = """WITH "A" AS (SELECT
  		CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
  		regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
  		FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
		"B" AS (SELECT "Author", lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "Author", "link" )
		SELECT "B"."Author",
		string_agg(LEFT(CAST("B"."link" AS text), 500) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps" 
		FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [
        row[:1] + tuple([x for x in row[1].split(", ")], ) for row in rows
        if row[0] != str(-1)
    ]
    gen_csv_from_tuples(filename, ["IdAuthor", "link"], rows)
    print("[+] Finished generating Link dataset in %d seconds" %
          (time.time() - tic))
Ejemplo n.º 21
0
def main():
    if len(sys.argv) < 2:
        print("""Usage: python3 dataset_generators.py <option>
	<option>:
		'datasets': generate datasets and directories
		'authorfolders': generate folders for users
		""")
        return

    if sys.argv[1] == 'authorfolders':
        print("[>>] Doing all")
        generate_directories_for_users()

    elif sys.argv[1] == 'datasets':
        print("[>>] NONE")
        create_directories_and_datasets()
        #extract_more_features()
    elif sys.argv[1] == 'morefeatures':
        print("[--] Extracting more features")
        extract_more_features()
        simplify()
    elif sys.argv[1] == 'simplify':
        simplify()
        print("Swapping files")
        files = [
            'trigram_files/user_to_trigram.csv',
            'timestamp_files/user_to_timestamp.csv'
        ]
        for i in files:
            swap_files(i, i + "_simple")
    elif sys.argv[1] == 'trigrams':
        # create_dir('link_files')
        # extract_user_to_link_csv("link_files/user_to_link.csv")
        extract_user_trigrams()
    elif sys.argv[1] == 'user_metrics':
        query = """ SELECT "IdMember", "Site" FROM "Member" WHERE "IdMember" != -1;"""
        members = make_query(query)
        members = [(int(x[0]), int(x[1])) for x in members]
        extract_chars_per_user(members)
Ejemplo n.º 22
0
def generate_user_dataset(user, uind, total):
    print("[-] Going for user %d/%d - %s" % (uind, total, user))
    tic = time.time()
    user_id, site_id = get_user_site(user)
    print("[- -] Extracting from DB")
    query = """SELECT "IdPost", "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    rows = make_query(query)
    rows = [(row[0], row[1]) for row in rows]
    print("[+ +] Done extracting from DB")
    #a = string.ascii_lowercase
    #b = math.ceil(float(len(rows)) / float(len(a)) )
    #names = ["".join(elem) for iter in [it.product(a, repeat=i) for  i in range(1,b + 1)] for elem in iter]
    directory = 'Author/' + user + "/"
    create_dir(directory)
    print("[- -] Generating files for user, total: %d" % (len(rows)))
    for ind, content in enumerate(rows):
        filename = str(user_id) + "-" + str(site_id) + "-" + str(content[0])
        with open(directory + filename + ".txt", 'w+') as file:
            file.write(content[1])
    print("[+ +] Generating files for user, total: %d" % (len(rows)))
    print("[+] Going for user %d - %s" % (uind, user))
Ejemplo n.º 23
0
def get_user_timestamps(user):
    user_id = user[0]
    site_id = user[1]
    #print("[- -] Extracting from DB")
    query = """SELECT "Timestamp" FROM "Post" WHERE"Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    rows = make_query(query)
    rows = [(row[0].weekday(), row[0].hour, row[0].minute) for row in rows]
    dic_result = dict()

    minute_intervals = 10
    residual_prob = 1 / minute_intervals
    total_intervals = 60 // 10
    # O(28)
    for i in range(7):
        for j in range(24):
            for z in range(total_intervals):
                dic_result[(i, j, z)] = 0.0
    # O(n)

    for row in rows:
        hour = row[1]
        minute = row[2] // minute_intervals

        dic_result[(row[0], hour, minute)] += 1
        dic_result[(row[0], hour,
                    (minute - 1) % total_intervals)] += residual_prob
        dic_result[(row[0], hour,
                    (minute + 1) % total_intervals)] += residual_prob
    # O(28)
    tup = (
        site_id,
        "%d[%d]" % (user_id, site_id),
    ) + tuple([
        "%s|%s|%s[%s]" % (k[0], k[1], k[2], v)
        for k, v in dic_result.items() if v > 0
    ])
    return tup
Ejemplo n.º 24
0
def get_username(user, site):
	query = """SELECT "Member"."Username" FROM "Member" WHERE "Member"."IdMember" = %d AND "Member"."Site" = %d;""" %(user, site)
	rows = make_query(query)
	return rows[0][0]
Ejemplo n.º 25
0
def extract_user_trigrams():
    query = """ SELECT "IdMember", "Site" FROM "Member" WHERE "IdMember" != -1;"""
    members = make_query(query)
    members = [(int(x[0]), int(x[1])) for x in members
               if not int(x[1]) in nonEnglishForums]
    do_extract_users_trigrams_mp(members)