Example #1
def extract_user_to_link_csv():
    # Extract every URL that appears in post content, grouped per author.
    query = """WITH "A" AS (SELECT
        CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
        regexp_matches("Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link"
        FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'),
        "B" AS (SELECT "Author", lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "Author", "link")
        SELECT "B"."Author",
        string_agg("B"."link", ', ') as "reps"
        FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # Expand the aggregated "link1, link2, ..." string into one tuple element
    # per link, skipping the placeholder author id -1.
    rows = [row[:1] + tuple(row[1].split(", ")) for row in rows if row[0] != -1]
    print(len(rows))
    gen_csv_from_tuples("link_files/user_to_link3.csv", ["IdAuthor", "link"], rows)
Example #2
def extract_user_to_email_csv():
    query = """WITH "A" AS (SELECT
        CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
        regexp_matches("Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email"
        FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'),
        "B" AS (SELECT "Author", lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "Author", "email")
        SELECT "B"."Author",
        string_agg("B"."email", ', ') as "reps"
        FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # Expand the aggregated "email1, email2, ..." string into one list element
    # per address, skipping the placeholder author id -1.
    rows = [
        list(row[:1] + tuple(row[1].split(", ")))
        for row in rows if row[0] != -1
    ]
    # Some addresses were stored with a "***LINK***" prefix; strip it when
    # something remains after the prefix.
    prefix = "***LINK***"
    for row in range(len(rows)):
        for col in range(1, len(rows[row])):
            if len(rows[row][col]) > len(prefix) and \
                    rows[row][col].startswith(prefix):
                rows[row][col] = rows[row][col][len(prefix):]

    # Deduplicate the addresses for each author.
    for row in range(len(rows)):
        rows[row] = (rows[row][0],) + tuple(set(rows[row][1:]))

    print(len(rows))
    gen_csv_from_tuples("email_files/user_to_email.csv", ["IdAuthor", "email"],
                        rows)
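
To make the reshaping concrete, here is a hypothetical aggregated row and what the steps above turn it into (set order in the final tuple is arbitrary):

# Hypothetical row as returned by make_query:
row = ("1234[siteA]",
       "***LINK***bob@example.com, alice@example.org, alice@example.org")
# After the split: ["1234[siteA]", "***LINK***bob@example.com",
#                   "alice@example.org", "alice@example.org"]
# After the strip: ["1234[siteA]", "bob@example.com",
#                   "alice@example.org", "alice@example.org"]
# After the dedup: ("1234[siteA]", "bob@example.com", "alice@example.org")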
Example #3
def extract_user_to_ip_csv():
    # Each octet is captured separately, so "ip" arrives as a 4-element array
    # that the final SELECT reassembles with '.' separators.
    query = """WITH "A" AS (SELECT
        CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author",
        regexp_matches("Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
        FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
        "B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip")
        SELECT "B"."Author",
        string_agg("B"."ip"[1] || '.' || "B"."ip"[2] || '.' || "B"."ip"[3] || '.' || "B"."ip"[4], ', ') as "reps"
        FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    print(len(rows))
    # Expand the aggregated "ip1, ip2, ..." string into one tuple element per
    # address, skipping the placeholder author id -1.
    rows = [
        row[:1] + tuple(row[1].split(", ")) for row in rows
        if row[0] != -1
    ]
    print(len(rows))
    gen_csv_from_tuples("ip_files/user_to_ip.csv", ["IdAuthor", "IP"], rows)
Example #4
    def store_normalized_results(self, users, normalized_matrix, filename):
        # One CSV row per user pair: the two ids followed by one normalized
        # score per feature file.
        head = ["IdAuthor1", "IdAuthor2"] + [prefix for _, prefix in self.list_files]
        lst = [
            tuple([pair[0], pair[1]] + list(values))
            for pair, values in zip(users, normalized_matrix)
        ]
        gen_csv_from_tuples(filename, head, lst)
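
A sketch of the shapes this method expects (the values and the contents of self.list_files are made up for illustration):

# users:             one (id1, id2) tuple per compared pair
# normalized_matrix: one row of per-feature scores per pair
# self.list_files:   e.g. [("ip.csv", "IP"), ("email.csv", "Email")]
#
# users[0]             = ("12[siteA]", "98[siteB]")
# normalized_matrix[0] = [0.8, 0.1]
# -> CSV header: IdAuthor1, IdAuthor2, IP, Email
# -> CSV row:    12[siteA], 98[siteB], 0.8, 0.1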
Example #5
    def analyze_connected_components(self):
        G = self.generate_graph()
        # nx.connected_component_subgraphs() was removed in NetworkX 2.4;
        # build one subgraph per connected component instead.
        components = (G.subgraph(c) for c in nx.connected_components(G))
        for i, graph in enumerate(components):
            num_nodes = graph.number_of_nodes()
            print("[-] Going for %d with %d" % (i, num_nodes))
            if num_nodes > 7:
                # Rank the component's users by degree, highest first.
                graph_lst = [(user, graph.degree(user)) for user in graph.nodes()]
                graph_lst = sorted(graph_lst, key=lambda x: x[1], reverse=True)
                gen_csv_from_tuples("graphs_info/%d-%d.csv" % (num_nodes, i),
                                    ["User", "#"], graph_lst)
Example #6
def extract_ip_to_usage():
    # Count how many times each IPv4 address appears across all post content.
    query = """WITH "A" AS (SELECT
        regexp_matches("Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip"
        FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'),
        "B" AS (SELECT "ip", count(*) as "repetitions" FROM "A" GROUP BY "ip")
        SELECT "B"."ip"[1] || '.' || "B"."ip"[2] || '.' || "B"."ip"[3] || '.' || "B"."ip"[4] as "ip", "B"."repetitions" FROM "B";"""
    rows = make_query(query)
    print(len(rows))
    gen_csv_from_tuples("ip_files/ip_count.csv", ["IP", "Reps"], rows)
Example #7
    def get_multfs(self, join_dict, matrix):
        # Term-frequency / inverse-document-frequency style weightings over
        # the pair-by-feature matrix.
        tf_func = lambda x: (x.T / x.sum(axis=1)).T
        idf_func = lambda x: np.log(x.shape[0] / (x != 0).sum(axis=0))
        tf_idf_func = lambda x: tf_func(x) * idf_func(x)
        idf_smooth_func = lambda x: np.log(x.shape[0] /
                                           ((x != 0).sum(axis=0) + 1)) + 1
        idf_smooth_func2 = lambda x: np.log(1 + (x.shape[0] /
                                                 ((x != 0).sum(axis=0))))
        # Min-max column normalization to [0, 1].
        normalize = lambda m: (m - m.min(axis=0)) / (m.max(axis=0) - m.min(axis=0))

        # Four MultFS scores, one per weighting scheme, scaled to [0, 100].
        multfs_func1 = lambda x: normalize(tf_idf_func(x).sum(axis=1)) * 100
        multfs_func2 = lambda x: normalize((x * idf_func(x)).sum(axis=1)) * 100
        multfs_func3 = lambda x: normalize(
            (x * idf_smooth_func(x)).sum(axis=1)) * 100
        multfs_func4 = lambda x: normalize(
            (x * idf_smooth_func2(x)).sum(axis=1)) * 100

        multfs1 = multfs_func1(matrix)
        multfs2 = multfs_func2(matrix)
        multfs3 = multfs_func3(matrix)
        multfs4 = multfs_func4(matrix)

        # Keep only pairs whose smooth-IDF or pure TF-IDF score clears 40.
        lst_res = []
        for i, (u1, u2) in enumerate(join_dict.keys()):
            if multfs3[i] > 40.0 or multfs1[i] > 40.0:
                lst_res.append(
                    (u1, u2, multfs1[i], multfs2[i], multfs3[i], multfs4[i]))

        # Sort by the first smooth-IDF score, highest first.
        lst_res = sorted(lst_res, key=lambda x: x[4], reverse=True)

        gen_csv_from_tuples("multfs.csv", [
            "User 1", "User 2", "MultFS Pure TFIDF", "MultFS IDF",
            "MultFS Smooth IDF 1", "MultFS Smooth IDF 2"
        ], lst_res)
        return lst_res
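
To see what the weightings in get_multfs actually compute, here is the pure TF-IDF variant on a made-up 3x2 pair-by-feature matrix:

import numpy as np

# Rows: user pairs; columns: features (e.g. shared IPs, shared emails).
x = np.array([[2.0, 0.0],
              [1.0, 1.0],
              [0.0, 4.0]])

tf = (x.T / x.sum(axis=1)).T                     # row-normalized frequencies
idf = np.log(x.shape[0] / (x != 0).sum(axis=0))  # both columns: log(3/2)
print(tf * idf)
# [[0.405  0.   ]
#  [0.203  0.203]
#  [0.     0.405]]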
Example #8
    def get_information_from_matrix(self, user_ind, sparse_matrix_dot):
        tic = time.time()
        inv_user_ind = {v: k for k, v in user_ind.items()}

        lst_res = []
        tx = sparse_matrix_dot.shape[0]
        print(sparse_matrix_dot.shape)
        for uind in range(tx):
            if uind % 100 == 0:
                self.pprint("Info Extraction",
                            "[%d Users Processed]" % (uind),
                            "[%d List Length]" % (len(lst_res)),
                            "[%0.3f Percentage]" % ((uind / tx) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            row = sparse_matrix_dot[uind].toarray().flatten()
            row[uind] = 0  # Do not compare a user with itself.
            rmax = row.max()
            if rmax > 0:
                # Several users may be tied at the maximum relation value;
                # np.argpartition puts the n largest entries in the last n
                # positions without fully sorting the row.
                n = (row == rmax).sum()
                max_uinds = np.argpartition(row, -n)[-n:]
                for i in max_uinds:
                    lst_res.append((inv_user_ind[uind], inv_user_ind[i], rmax))

        lst_res = sorted(lst_res, key=lambda x: x[2], reverse=True)
        pickle_object(lst_res, self.dir + "results.pkl")
        self.pprint("[END] Info Extraction", "[%d Users Processed]" % (uind),
                    "[%d List Length]" % (len(lst_res)),
                    "[%0.3f Percentage]" % ((uind / tx) * 100), get_ram(),
                    get_elapsed_time(tic))

        gen_csv_from_tuples(self.dir + "results.csv",
                            ["User1", "User2", "Relation Value"], lst_res)

        return lst_res
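
The np.argpartition trick used above, in isolation on a made-up row:

import numpy as np

row = np.array([3, 7, 1, 7, 2])
n = (row == row.max()).sum()         # two entries tied at the maximum
top = np.argpartition(row, -n)[-n:]  # indices of the n largest, unsorted
print(sorted(top))
# [1, 3] -- argpartition only guarantees the n largest land in the last n
# slots, which is enough here because every kept entry equals rmax.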