def get_num_threads(user, site):
    """Return (#threads authored, #threads participated in) for a user on a site."""
    if DEBUG:
        print("[-] Getting num_threads")
    # Threads the user started.
    authored_q = """SELECT count("IdThread") FROM "Thread" WHERE "Thread"."Author" = %d AND "Thread"."Site" = %d;""" % (user, site)
    # Distinct threads the user posted in.
    participated_q = """WITH A AS (SELECT "Post"."Thread" FROM "Post" WHERE "Post"."Author" = %d AND "Post"."Site" = %d GROUP BY "Post"."Thread") SELECT count(A."Thread") FROM A;""" % (user, site)
    authored = make_query(authored_q)
    participated = make_query(participated_q)
    return authored[0][0], participated[0][0]
def extract_user_to_btc_csv(filename):
    """Extract per-user Bitcoin address candidates from post bodies into a CSV.

    Writes rows of ("author[site]", "addr[count]", ...) to *filename*,
    with addresses deduplicated per author.
    """
    tic = time.time()
    print("[+] Generating Bitcoin dataset")
    # Raw string: the POSIX regex escapes (\y) are not valid Python escapes
    # and previously relied on invalid-escape passthrough (warns on 3.6+).
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", '\y([13][a-km-zA-HJ-NP-Z1-9]{25,34})\y', 'g') AS "btc"
FROM "Post" WHERE "Content" ~ '([13][a-km-zA-HJ-NP-Z1-9]{25,34})'),
"B" AS (SELECT "Author", "btc"[1] as "btc", count(*) as "repetitions" FROM "A" GROUP BY "Author", "btc" )
SELECT "B"."Author", string_agg(CAST("B"."btc" AS text) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps"
FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # Split the aggregated "addr[n], addr[n], ..." string; drop the -1 author.
    rows = [[row[0]] + row[1].split(", ") for row in rows if row[0] != str(-1)]
    for row in rows:
        for col in range(1, len(row)):
            # The regex can capture a trailing sentence period; strip it.
            if row[col][-1] == '.':
                row[col] = row[col][:-1]
    # Deduplicate addresses per author.
    rows = [(row[0],) + tuple(set(row[1:])) for row in rows]
    gen_csv_from_tuples(filename, ["IdAuthor", "btc"], rows)
    print("[+] Finished generating Bitcoin dataset in %d seconds" % (time.time() - tic))
def extract_user_to_ip_csv():
    """Legacy variant: dump user -> IPv4 addresses to "ip_files/user_to_ip.csv".

    NOTE(review): shadowed by the later parameterized
    extract_user_to_ip_csv(filename) definition in this module -- confirm
    which version is intended to survive.
    """
    # The original assigned a first query (which also aggregated repetition
    # counts) and immediately overwrote it; that dead assignment is removed.
    # Raw string: regex escapes like \. are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
"B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip" )
SELECT "B"."Author", string_agg("B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4], ', ') as "reps"
FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    print(len(rows))
    # "Author" is a text column ("id[site]"); the original compared against
    # int -1, which can never match a string. Use the string form, as the
    # sibling extractors do.
    rows = [row[:1] + tuple(row[1].split(", ")) for row in rows if row[0] != str(-1)]
    print(len(rows))
    gen_csv_from_tuples("ip_files/user_to_ip.csv", ["IdAuthor", "IP"], rows)
def extract_user_to_email_csv(filename):
    """Extract per-user email addresses from post bodies into a CSV.

    Writes rows of ("author[site]", "email[count]", ...) to *filename*,
    deduplicated per author.

    NOTE(review): a later zero-argument extract_user_to_email_csv() in this
    module shadows this definition -- confirm which one is intended.
    """
    tic = time.time()
    print("[+] Generating Email dataset")
    # Raw string: regex escapes like \. and \) are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
"B" AS (SELECT "Author", lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "Author", "email" )
SELECT "B"."Author", string_agg(CAST("B"."email" AS text) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps"
FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [[row[0]] + row[1].split(", ") for row in rows if row[0] != str(-1)]
    marker = "***LINK***"
    for row in rows:
        for col in range(1, len(row)):
            # Strip a leading sanitized-link marker glued onto the address.
            if len(row[col]) > len(marker) and row[col][:len(marker)] == marker:
                row[col] = row[col][len(marker):]
    # Deduplicate addresses per author.
    rows = [(row[0],) + tuple(set(row[1:])) for row in rows]
    gen_csv_from_tuples(filename, ["IdAuthor", "email"], rows)
    print("[+] Finished generating Email dataset in %d seconds" % (time.time() - tic))
def extract_user_to_ip_csv(filename):
    """Extract per-user IPv4 address candidates from post bodies into a CSV.

    Rows are ("author[site]", "a.b.c.d[count]", ...); candidates are
    re-validated in Python with check_ip() before being written.
    """
    tic = time.time()
    print("[+] Generating IP Address dataset")
    # Raw string: regex escapes like \. are not valid Python escapes.
    # The first regexp_matches line is a SQL comment (--) keeping the strict
    # octet pattern for reference; the active pattern is deliberately looser
    # ([0-9]+ prefixes) and relies on check_ip() below -- TODO confirm intended.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
-- regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
regexp_matches( "Content", '([0-9]+25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)\.([0-9]+25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)\.([0-9]+25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)\.(25[0-5]|[0-9]+2[0-4][0-9]|[0-9]+[01]?[0-9][0-9]?)', 'g') AS "ip"
FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
"B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip" )
SELECT "B"."Author", string_agg("B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4] || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps"
FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [row[:1] + tuple(row[1].split(", ")) for row in rows if row[0] != str(-1)]
    cleaned_rows = []
    for r in rows:
        cleaned = [r[0]]
        for ip in r[1:]:
            # Entries look like "a.b.c.d[count]"; validate the address part only.
            if check_ip(ip.split('[')[0]):
                cleaned.append(ip)
        cleaned_rows.append(tuple(cleaned))
    gen_csv_from_tuples(filename, ["IdAuthor", "IP"], cleaned_rows)
    print("[+] Finished generating IP Address dataset in %d seconds" % (time.time() - tic))
def extract_user_to_skype_csv():
    """Legacy variant: dump user -> skype handles to "skype_files/user_to_skype.csv".

    NOTE(review): shadowed by the later parameterized
    extract_user_to_skype_csv(filename) definition in this module.
    """
    # Raw string: regex escapes like \s and \. are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})', 'g') AS "skype"
FROM "Post" WHERE "Content" ~ 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})'),
"B" AS (SELECT "Author", lower("skype"[1]) as "skype", count(*) as "repetitions" FROM "A" GROUP BY "Author", "skype" )
SELECT "B"."Author", string_agg("B"."skype", ', ') as "reps" FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # "Author" is a text column; the original compared against int -1, which
    # never matches a string. Use the string form as the siblings do.
    rows = [[row[0]] + row[1].split(", ") for row in rows if row[0] != str(-1)]
    for row in rows:
        for col in range(1, len(row)):
            # The regex can capture a trailing sentence period; strip it.
            if row[col][-1] == '.':
                row[col] = row[col][:-1]
    # Deduplicate handles per author.
    rows = [(row[0],) + tuple(set(row[1:])) for row in rows]
    print(len(rows))  # was printed twice; one debug line is enough
    gen_csv_from_tuples("skype_files/user_to_skype.csv", ["IdAuthor", "skype"], rows)
def extract_user_to_skype_csv(filename):
    """Extract per-user skype handles from post bodies into a CSV.

    Writes rows of ("author[site]", "handle[count]", ...) to *filename*,
    deduplicated per author.
    """
    tic = time.time()
    print("[+] Generating Skype dataset")
    # Raw string: regex escapes like \s and \. are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})', 'g') AS "skype"
FROM "Post" WHERE "Content" ~ 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})'),
"B" AS (SELECT "Author", lower("skype"[1]) as "skype", count(*) as "repetitions" FROM "A" GROUP BY "Author", "skype" )
SELECT "B"."Author", string_agg(CAST("B"."skype" AS text) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps"
FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [[row[0]] + row[1].split(", ") for row in rows if row[0] != str(-1)]
    for row in rows:
        for col in range(1, len(row)):
            # The regex can capture a trailing sentence period; strip it.
            if row[col][-1] == '.':
                row[col] = row[col][:-1]
    # Deduplicate handles per author.
    rows = [(row[0],) + tuple(set(row[1:])) for row in rows]
    gen_csv_from_tuples(filename, ["IdAuthor", "skype"], rows)
    print("[+] Finished generating Skype dataset in %d seconds" % (time.time() - tic))
def extract_user_to_link_csv():
    """Legacy variant: dump user -> URLs to "link_files/user_to_link3.csv".

    NOTE(review): shadowed by the later parameterized
    extract_user_to_link_csv(filename) definition in this module.
    """
    # Raw string: regex escapes like \) are not valid Python escapes.
    # A large commented-out post-processing block (period stripping and
    # per-author dedup) was removed as dead code.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
"B" AS (SELECT "Author", lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "Author", "link" )
SELECT "B"."Author", string_agg("B"."link", ', ') as "reps" FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # "Author" is a text column; the original compared against int -1, which
    # never matches a string. Use the string form as the siblings do.
    rows = [row[:1] + tuple(row[1].split(", ")) for row in rows if row[0] != str(-1)]
    print(len(rows))
    gen_csv_from_tuples("link_files/user_to_link3.csv", ["IdAuthor", "link"], rows)
def extract_user_to_email_csv():
    """Legacy variant: dump user -> emails to "email_files/user_to_email.csv".

    NOTE(review): this definition shadows the earlier parameterized
    extract_user_to_email_csv(filename) in this module -- confirm which one
    is intended to survive.
    """
    # Raw string: regex escapes like \. and \) are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
"B" AS (SELECT "Author", lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "Author", "email" )
SELECT "B"."Author", string_agg("B"."email", ', ') as "reps" FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # "Author" is a text column; the original compared against int -1, which
    # never matches a string. Use the string form as the siblings do.
    rows = [[row[0]] + row[1].split(", ") for row in rows if row[0] != str(-1)]
    marker = "***LINK***"
    for row in rows:
        for col in range(1, len(row)):
            # Strip a leading sanitized-link marker glued onto the address.
            if len(row[col]) > len(marker) and row[col][:len(marker)] == marker:
                row[col] = row[col][len(marker):]
    # Deduplicate addresses per author.
    rows = [(row[0],) + tuple(set(row[1:])) for row in rows]
    print(len(rows))  # was printed twice; one debug line is enough
    gen_csv_from_tuples("email_files/user_to_email.csv", ["IdAuthor", "email"], rows)
def extract_more_features():
    """Collect every real member as an (id, site) pair and extract timestamps."""
    query = """ SELECT "IdMember", "Site" FROM "Member" WHERE "IdMember" != -1;"""
    pairs = [(int(member[0]), int(member[1])) for member in make_query(query)]
    do_extract_timestamps(pairs)
def get_post_content(user, site):
    """Return the list of post bodies written by *user* on *site*."""
    if DEBUG:
        print("[-] Getting post content %d, %d" % (user, site))
    query = """SELECT "Post"."Content" from "Post" WHERE "Post"."Author" = %d AND "Post"."Site" = %d;""" % (user, site)
    return [record[0] for record in make_query(query)]
def get_user_posts(user):
    """Return (IdPost, Content) tuples for a user given as an "id[site]" string.

    NOTE(review): redefined later in this module with a (user, connector)
    signature, which shadows this version -- confirm which one is intended.
    """
    bracket = user.find("[")
    user_id = int(user[:bracket])
    site_id = int(user[bracket + 1:-1])  # drop the trailing ']'
    query = """SELECT "IdPost", "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (user_id, site_id)
    return [(record[0], record[1]) for record in make_query(query)]
def get_user_posts(user, connector=None):
    """Return post bodies for *user* given as a (user_id, site_id) pair.

    connector: optional DB connection forwarded to make_query().
    """
    user_id, site_id = user[0], user[1]
    query = """SELECT "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    result = make_query(query, conn=connector)
    return [record[0] for record in result]
def extract_link_to_usage():
    """Count occurrences of each (lowercased) URL across all posts; write CSV."""
    # Raw string: regex escapes like \) are not valid Python escapes and
    # previously relied on invalid-escape passthrough (warns on 3.6+).
    query = r"""WITH "A" AS (SELECT regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
"B" AS (SELECT lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "link" )
SELECT "link", "repetitions" FROM "B";"""
    rows = make_query(query)
    print(len(rows))
    gen_csv_from_tuples("link_files/link_count.csv", ["link", "Reps"], rows)
def get_username(user):
    """Return (username, registration_date, last_post_date) for an "id[site]" user.

    NOTE(review): shadowed by a later two-argument get_username(user, site)
    definition in this module -- confirm which one is intended.
    """
    bracket = user.find("[")
    user_id = int(user[:bracket])
    site_id = int(user[bracket + 1:-1])  # drop the trailing ']'
    query = """SELECT "Username", "RegistrationDate", "LastPostDate" FROM "Member" WHERE "IdMember" = %d AND "Site" = %d;""" % (user_id, site_id)
    record = make_query(query)[0]
    return record[0], record[1], record[2]
def extract_ip_to_usage():
    """Count occurrences of each IPv4 address across all posts; write CSV."""
    # Raw string: regex escapes like \. are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
"B" AS (SELECT "ip", count(*) as "repetitions" FROM "A" GROUP BY "ip" )
SELECT "B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4] as "ip", "B"."repetitions" FROM "B";"""
    rows = make_query(query)
    print(len(rows))  # was printed twice; one debug line is enough
    gen_csv_from_tuples("ip_files/ip_count.csv", ["IP", "Reps"], rows)
def extract_skype_to_usage():
    """Count occurrences of each (lowercased) skype handle; write CSV."""
    # Raw string: regex escapes like \s and \. are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT regexp_matches( "Content", 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})', 'g') AS "skype"
FROM "Post" WHERE "Content" ~ 'skype\s*:\s*([a-zA-Z0-9:\.]{1,37})'),
"B" AS (SELECT lower("skype"[1]) as "skype", count(*) as "repetitions" FROM "A" GROUP BY "skype" )
SELECT "skype", "repetitions" FROM "B";"""
    rows = make_query(query)
    # The regex can capture a trailing sentence period; strip it.
    rows = [(handle[:-1] if handle.endswith('.') else handle, reps)
            for handle, reps in rows]
    print(len(rows))  # was printed twice; one debug line is enough
    gen_csv_from_tuples("skype_files/skype_count.csv", ["skype", "Reps"], rows)
def extract_btc_to_usage():
    """Count occurrences of each Bitcoin address candidate; write CSV."""
    # BUG FIX: the grouping previously applied lower() to the match, but
    # base58 Bitcoin addresses are case-sensitive -- lowercasing merged
    # distinct addresses and emitted invalid ones. The per-user extractor
    # (extract_user_to_btc_csv) already groups case-sensitively.
    query = r"""WITH "A" AS (SELECT regexp_matches( "Content", '([13][a-km-zA-HJ-NP-Z1-9]{25,34})', 'g') AS "btc"
FROM "Post" WHERE "Content" ~ '([13][a-km-zA-HJ-NP-Z1-9]{25,34})'),
"B" AS (SELECT "btc"[1] as "btc", count(*) as "repetitions" FROM "A" GROUP BY "btc" )
SELECT "btc", "repetitions" FROM "B";"""
    rows = make_query(query)
    # Defensive: strip a trailing sentence period if one was ever captured.
    rows = [(addr[:-1] if addr.endswith('.') else addr, reps)
            for addr, reps in rows]
    print(len(rows))  # was printed twice; one debug line is enough
    gen_csv_from_tuples("btc_files/btc_count.csv", ["btc", "Reps"], rows)
def extract_email_to_usage():
    """Count occurrences of each (lowercased) email address; write CSV."""
    # Raw string: regex escapes like \. and \) are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
"B" AS (SELECT lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "email" )
SELECT "email", "repetitions" FROM "B";"""
    rows = make_query(query)
    marker = "***LINK***"
    cleaned = []
    for email, reps in rows:
        # Strip a leading sanitized-link marker glued onto the address.
        if len(email) > len(marker) and email[:len(marker)] == marker:
            email = email[len(marker):]
        cleaned.append((email, reps))
    print(len(cleaned))  # was printed twice; one debug line is enough
    gen_csv_from_tuples("email_files/email_count.csv", ["email", "Reps"], cleaned)
def extract_user_to_link_csv(filename):
    """Extract per-user URLs (truncated to 500 chars) into a CSV.

    Writes rows of ("author[site]", "link[count]", ...) to *filename*.
    """
    tic = time.time()
    print("[+] Generating Link dataset")
    # Raw string: regex escapes like \) are not valid Python escapes.
    query = r"""WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
"B" AS (SELECT "Author", lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "Author", "link" )
SELECT "B"."Author", string_agg(LEFT(CAST("B"."link" AS text), 500) || '[' || CAST("B"."repetitions" AS text) || ']', ', ') as "reps"
FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    rows = [row[:1] + tuple(row[1].split(", ")) for row in rows if row[0] != str(-1)]
    gen_csv_from_tuples(filename, ["IdAuthor", "link"], rows)
    print("[+] Finished generating Link dataset in %d seconds" % (time.time() - tic))
def main():
    """CLI dispatcher for the dataset-generation utilities."""
    if len(sys.argv) < 2:
        print("""Usage: python3 dataset_generators.py <option>
<option>:
'datasets': generate datasets and directories
'authorfolders': generate folders for users
""")
        return
    option = sys.argv[1]
    if option == 'authorfolders':
        print("[>>] Doing all")
        generate_directories_for_users()
    elif option == 'datasets':
        print("[>>] NONE")
        create_directories_and_datasets()
        #extract_more_features()
    elif option == 'morefeatures':
        print("[--] Extracting more features")
        extract_more_features()
        simplify()
    elif option == 'simplify':
        simplify()
        print("Swapping files")
        files = [
            'trigram_files/user_to_trigram.csv',
            'timestamp_files/user_to_timestamp.csv'
        ]
        for path in files:
            swap_files(path, path + "_simple")
    elif option == 'trigrams':
        # create_dir('link_files')
        # extract_user_to_link_csv("link_files/user_to_link.csv")
        extract_user_trigrams()
    elif option == 'user_metrics':
        query = """ SELECT "IdMember", "Site" FROM "Member" WHERE "IdMember" != -1;"""
        members = [(int(m[0]), int(m[1])) for m in make_query(query)]
        extract_chars_per_user(members)
def generate_user_dataset(user, uind, total):
    """Dump every post of *user* ("id[site]") into Author/<user>/ as .txt files.

    uind/total are only used for progress logging.
    """
    print("[-] Going for user %d/%d - %s" % (uind, total, user))
    tic = time.time()
    user_id, site_id = get_user_site(user)
    print("[- -] Extracting from DB")
    query = """SELECT "IdPost", "Content" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    posts = [(record[0], record[1]) for record in make_query(query)]
    print("[+ +] Done extracting from DB")
    directory = 'Author/' + user + "/"
    create_dir(directory)
    print("[- -] Generating files for user, total: %d" % (len(posts)))
    for post_id, body in posts:
        # One file per post: <user>-<site>-<post>.txt
        fname = str(user_id) + "-" + str(site_id) + "-" + str(post_id)
        with open(directory + fname + ".txt", 'w+') as handle:
            handle.write(body)
    print("[+ +] Generating files for user, total: %d" % (len(posts)))
    print("[+] Going for user %d - %s" % (uind, user))
def get_user_timestamps(user):
    """Build a smoothed weekly posting-time histogram for a (user_id, site_id) pair.

    Buckets are (weekday, hour, 10-minute slot). Each post adds 1 to its own
    slot and 1/minute_intervals to the two neighbouring slots of the same
    hour (circular within the hour), smoothing posting times slightly.

    Returns: (site_id, "user[site]", "day|hour|slot[weight]", ...) covering
    every non-empty bucket, in bucket-construction order.
    """
    user_id = user[0]
    site_id = user[1]
    query = """SELECT "Timestamp" FROM "Post" WHERE "Author" = %d AND "Site" = %d;""" % (
        user_id, site_id)
    rows = make_query(query)
    stamps = [(row[0].weekday(), row[0].hour, row[0].minute) for row in rows]
    minute_intervals = 10
    residual_prob = 1 / minute_intervals
    # BUG FIX: was hard-coded as 60 // 10; derive it from minute_intervals so
    # the two constants cannot drift apart (the value is unchanged: 6).
    total_intervals = 60 // minute_intervals
    # Pre-seed every bucket with 0.0 so the output order is deterministic.
    dic_result = dict()
    for day in range(7):
        for hour in range(24):
            for slot in range(total_intervals):
                dic_result[(day, hour, slot)] = 0.0
    for day, hour, minute in stamps:
        slot = minute // minute_intervals
        dic_result[(day, hour, slot)] += 1
        dic_result[(day, hour, (slot - 1) % total_intervals)] += residual_prob
        dic_result[(day, hour, (slot + 1) % total_intervals)] += residual_prob
    return (
        site_id,
        "%d[%d]" % (user_id, site_id),
    ) + tuple([
        "%s|%s|%s[%s]" % (k[0], k[1], k[2], v)
        for k, v in dic_result.items() if v > 0
    ])
def get_username(user, site):
    """Look up the username for member id *user* on *site*."""
    query = """SELECT "Member"."Username" FROM "Member" WHERE "Member"."IdMember" = %d AND "Member"."Site" = %d;""" % (user, site)
    result = make_query(query)
    return result[0][0]
def extract_user_trigrams():
    """Run trigram extraction for every member outside the non-English forums."""
    query = """ SELECT "IdMember", "Site" FROM "Member" WHERE "IdMember" != -1;"""
    members = [
        (int(m[0]), int(m[1]))
        for m in make_query(query)
        if not int(m[1]) in nonEnglishForums
    ]
    do_extract_users_trigrams_mp(members)