def get_top_authors(top_n, json_filename):
    """
    Gets top n authors based on the ranking generated from
    generate_author_ranking in analysis.author.ranking.

    :param top_n: Number of top authors to be returned. Values less than 1
        yield an empty result (previously, top_n == 0 returned *all* authors).
    :param json_filename: The JSON file from which author scores are generated.
    :return: Tuple (top_authors, top_authors_index): a set of email addresses
        and a dict mapping each address to its 1-based rank.
    """
    top_authors = set()
    top_authors_index = dict()
    # Scores are computed in memory only (write_to_file=False); ranking.get
    # yields (email_addr, author_score) pairs, highest-ranked first.
    author_scores = ranking.get(json_filename, output_filename=None,
                                active_score=2, passive_score=1,
                                write_to_file=False)
    for index, (email_addr, author_score) in enumerate(author_scores, start=1):
        # Guard *before* inserting: the original post-insert check
        # (`if index == top_n: break`) never fired for top_n <= 0 and so
        # returned every author instead of none.
        if index > top_n:
            break
        top_authors.add(email_addr)
        top_authors_index[email_addr] = index
    return top_authors, top_authors_index
def get_top_authors(top_n, json_filename):
    """
    Gets the top n authors based on the ranking generated from
    generate_author_ranking in analysis.author.ranking.

    NOTE(review): this re-defines a function with the identical name and body
    earlier in this file; the later definition wins at import time. Confirm
    and remove one of the two copies.

    :param top_n: The number of top authors to be returned. Non-positive
        values yield an empty result (the original counter-based loop
        returned *all* authors when top_n == 0).
    :param json_filename: The JSON file from which author scores are generated.
    :return: Tuple (top_authors, top_authors_index): set of email addresses
        and a dict mapping each address to its 1-based rank.
    """
    top_authors = set()
    top_authors_index = dict()
    # In-memory ranking only (write_to_file=False); pairs arrive sorted by
    # score, best first.
    author_scores = ranking.get(json_filename, output_filename=None,
                                active_score=2, passive_score=1,
                                write_to_file=False)
    for rank, (email_addr, author_score) in enumerate(author_scores, start=1):
        # Stop before inserting once we are past the requested count, so
        # top_n <= 0 correctly returns nothing.
        if rank > top_n:
            break
        top_authors.add(email_addr)
        top_authors_index[email_addr] = rank
    return top_authors, top_authors_index
def write_matrix(json_data, tree_filename="infomap/output/author_graph.tree"):
    """
    Writes to a CSV file ("top_authors_data.csv") the Author Score, In-Degree,
    Out-Degree, Clustering Coeff and Module Flow coefficient ascertained from
    the output of the Infomap algorithm.

    :param json_data: Dict mapping message-IDs to parsed message dicts (keys
        read here: 'From', 'To', 'Cc') for the dataset under analysis.
    :param tree_filename: Path of the tree file generated by the Infomap
        community detection module.
    :return: None
    """
    top_authors = set()
    top_authors_data = dict()
    # NOTE(review): other call sites pass a json_filename as the first
    # positional argument to ranking.get — confirm this call is not missing it.
    author_scores = ranking.get(active_score=2, passive_score=1,
                                write_to_file=False)
    # Keep only the 100 highest-ranked authors.
    for rank, (email_addr, author_score) in enumerate(author_scores, start=1):
        top_authors.add(email_addr)
        top_authors_data[email_addr] = [author_score]
        if rank == 100:
            break
    print("Adding nodes to author's graph...")
    author_graph = nx.DiGraph()
    for msg_id, message in json_data.items():
        if message['From'] in top_authors:
            # 'To' and 'Cc' are sets of addresses; Cc may be None.
            addr_list = message['To'] if message['Cc'] is None \
                else message['To'] | message['Cc']
            for to_address in addr_list:
                if to_address in top_authors:
                    if author_graph.has_edge(message['From'], to_address):
                        # NOTE(review): this *shrinks* the weight on repeated
                        # edges (w -> w*w/(w+1), e.g. 1 -> 0.5); a simple
                        # increment may have been intended — confirm before
                        # changing, as downstream metrics depend on it.
                        author_graph[message['From']][to_address]['weight'] *= \
                            author_graph[message['From']][to_address]['weight'] / \
                            (author_graph[message['From']][to_address]['weight'] + 1)
                    else:
                        author_graph.add_edge(message['From'], to_address, weight=1)
    # Clustering coefficient is defined on the undirected projection.
    author_graph_undirected = author_graph.to_undirected()
    clustering_coeff = nx.clustering(author_graph_undirected)
    # NOTE(review): nodes_iter() is NetworkX 1.x API (renamed nodes() in 2.x);
    # confirm the pinned networkx version before upgrading.
    in_degree_dict = author_graph.in_degree(nbunch=author_graph.nodes_iter())
    out_degree_dict = author_graph.out_degree(nbunch=author_graph.nodes_iter())
    for email_addr in top_authors:
        top_authors_data[email_addr].append(in_degree_dict[email_addr])
        top_authors_data[email_addr].append(out_degree_dict[email_addr])
        top_authors_data[email_addr].append(clustering_coeff[email_addr])
    print("Parsing", tree_filename + "...")
    with open(tree_filename, 'r') as tree_file:
        for line in tree_file:
            # Skip blank lines and '#' comment lines in the Infomap output.
            if not line or line[0] == '#':
                continue
            line = line.split()
            # Column 2 holds the quoted node name; [1:-1] strips the quotes.
            if line[2][1:-1] in top_authors:
                top_authors_data[line[2][1:-1]].append(float(line[1]))
    # Redundant explicit close() calls inside the with-blocks were removed;
    # the context managers already close both files.
    with open("top_authors_data.csv", 'w') as output_file:
        output_file.write(
            "Email Address,Author Score,In-Degree,Out-Degree,Clustering Coeff,Module Flow\n"
        )
        for email_addr, data_list in top_authors_data.items():
            output_file.write(email_addr + "," +
                              ",".join([str(x) for x in data_list]) + "\n")
    print("Authors data written to file.")
def vertex_clustering(json_filename, nodelist_filename, edgelist_filename, foldername, time_limit=None, ignore_lat=False):
    """
    This function performs vertex clustering on the dataset passed in the
    parameters and saves the dendrogram resulting from the vertex clustering
    as a PDF along with the visualization of the vertex cluster itself. It is
    recommended to limit these graphs to 200 authors as the visualization
    becomes incomprehensible beyond that.

    :param json_filename: Path of the JSON file containing the dataset under
        analysis.
    :param nodelist_filename: Path of the CSV file containing the list of
        nodes for the dataset under analysis.
    :param edgelist_filename: Path of the CSV file containing the list of
        edges for the dataset under analysis.
    :param foldername: Output folder prefix under which "vd.pdf", "vc.pdf"
        and "community_vertex_clustering.txt" are written.
    :param time_limit: Time limit can be specified here in the form of a
        timestamp in one of the identifiable formats and all messages that
        have arrived after this timestamp will be ignored.
    :param ignore_lat: If true, then messages that belong to threads that
        have only a single author are ignored.
    :return: None
    """
    json_data = dict()
    # Loose pattern that extracts bare email addresses from header strings.
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    if time_limit is None:
        # NOTE(review): %z expands to '' for a naive local time, so the
        # default string may lack a UTC offset — confirm get_datetime_object
        # tolerates that.
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    # NOTE(review): msgs_before_time is never used in this function.
    msgs_before_time = set()
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")
    if not ignore_lat:
        # Load every message whose timestamp precedes time_limit.
        with open(json_filename, 'r') as json_file:
            # Each message record spans 9 lines in the headers file.
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                json_obj['Message-ID'] = int(json_obj['Message-ID'])
                json_obj['Time'] = datetime.datetime.strptime(
                    json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
                if json_obj['Time'] < time_limit:
                    # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                    # Normalize From to a bare address; To/Cc become sets.
                    from_addr = email_re.search(json_obj['From'])
                    json_obj['From'] = from_addr.group(
                        0) if from_addr is not None else json_obj['From']
                    json_obj['To'] = set(email_re.findall(json_obj['To']))
                    json_obj['Cc'] = set(
                        email_re.findall(json_obj['Cc'])
                    ) if json_obj['Cc'] is not None else None
                    # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                    json_data[json_obj['Message-ID']] = json_obj
    else:
        # Same load, but additionally drop messages belonging to threads
        # with a single author.
        lone_author_threads = get_lone_author_threads(False, nodelist_filename, edgelist_filename)
        with open(json_filename, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                json_obj['Message-ID'] = int(json_obj['Message-ID'])
                if json_obj['Message-ID'] not in lone_author_threads:
                    json_obj['Time'] = datetime.datetime.strptime(
                        json_obj['Time'], "%a, %d %b %Y %H:%M:%S %z")
                    if json_obj['Time'] < time_limit:
                        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                        from_addr = email_re.search(json_obj['From'])
                        json_obj['From'] = from_addr.group(
                            0) if from_addr is not None else json_obj['From']
                        json_obj['To'] = set(email_re.findall(json_obj['To']))
                        json_obj['Cc'] = set(
                            email_re.findall(json_obj['Cc'])
                        ) if json_obj['Cc'] is not None else None
                        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
                        json_data[json_obj['Message-ID']] = json_obj
    print("JSON data loaded.")
    author_graph = igraph.Graph()
    author_graph.es["weight"] = 1.0
    author_map = dict()
    """
    Graphs can also be indexed by strings or pairs of vertex indices or vertex
    names. When a graph is indexed by a string, the operation translates to
    the retrieval, creation, modification or deletion of a graph attribute.
    When a graph is indexed by a pair of vertex indices or names, the graph
    itself is treated as an adjacency matrix and the corresponding cell of the
    matrix is returned. Assigning values different from zero or one to the
    adjacency matrix will be translated to one, unless the graph is weighted,
    in which case the numbers will be treated as weights.
    """
    # Restrict the graph to the 100 highest-ranked authors.
    top_authors = set()
    author_scores = ranking.get(json_filename, None, active_score=2, passive_score=1, write_to_file=False)
    index = 0
    for email_addr, author_score in author_scores:
        index += 1
        top_authors.add(email_addr)
        if index == 100:
            break
    index = 0
    # NOTE(review): `id` shadows the builtin of the same name; it is unused
    # in the loop body.
    for id, node in json_data.items():
        if node['From'] in top_authors:
            if node['From'] not in author_map:
                author_map[node['From']] = index
                author_graph.add_vertex(name=node['From'], label=node['From'])
                index += 1
            # Add/weight edges to every top-author recipient in To.
            for to_addr in node['To']:
                if to_addr in top_authors:
                    if to_addr not in author_map:
                        author_map[to_addr] = index
                        author_graph.add_vertex(name=to_addr, label=to_addr)
                        index += 1
                    # Adjacency-matrix indexing: 0 means "no edge yet".
                    if author_graph[node['From'], to_addr] == 0:
                        author_graph.add_edge(node['From'], to_addr, weight=1)
                    else:
                        author_graph[node['From'], to_addr] += 1
            if node['Cc'] is None:
                continue
            # Same treatment for Cc recipients.
            for to_addr in node['Cc']:
                if to_addr in top_authors:
                    if to_addr not in author_map:
                        author_map[to_addr] = index
                        author_graph.add_vertex(name=to_addr, label=to_addr)
                        index += 1
                    if author_graph[node['From'], to_addr] == 0:
                        author_graph.add_edge(node['From'], to_addr, weight=1)
                    else:
                        author_graph[node['From'], to_addr] += 1
    print("Nodes and Edges added to iGraph.")
    # Edge-betweenness community detection; the dendrogram is plotted first.
    vertex_dendogram = author_graph.community_edge_betweenness(
        clusters=8,
        directed=True,
        weights="weight")
    igraph.plot(vertex_dendogram, foldername + "vd.pdf", vertex_label_size=3, bbox=(1200, 1200))
    print("Dendrogram saved as PDF.")
    # Infomap clustering on the same weighted graph, plotted separately.
    vertex_clustering_obj = author_graph.community_infomap(
        edge_weights=author_graph.es["weight"])
    igraph.plot(vertex_clustering_obj, foldername + "vc.pdf", vertex_label_size=10, bbox=(1500, 1500), edge_color="gray")
    print("Vertex Clustering saved as PDF.")
    with open(foldername + "community_vertex_clustering.txt", 'w') as output_file:
        output_file.write(str(vertex_clustering_obj))
        # Redundant: the with-block already closes the file.
        output_file.close()
# Full per-mailbox analysis pipeline: clustering, distributions, keyword
# digest, author ranking, width/height table, refresh times, and k-means.
# Relies on mailbox/path variables (mailbox, headers_filename, ...) bound by
# the enclosing scope, which is outside this chunk.
print("Processing Mailbox:", mailbox)
vertex_clustering(headers_filename, nodelist_filename, edgelist_filename, foldername)
generate_hyperedge_distribution(nodelist_filename, edgelist_filename, headers_filename, foldername)
generate_keyword_digest(mbox_filename,
                        output_filename=foldername + "/author_keyword_digest.txt",
                        author_uid_filename=author_uid_filename,
                        json_filename=headers_filename,
                        top_n=250,
                        console_output=False)
# Writes the author ranking table as CSV (write_to_file defaults on here).
ranking.get(headers_filename,
            output_filename=foldername + "/tables/author_ranking.csv",
            active_score=2,
            passive_score=1)
generate_wh_table_authors(nodelist_filename, edgelist_filename,
                          foldername + '/tables/wh_table_authors.csv')
# NOTE(review): sibling paths are joined with a leading '/', this one is not
# (foldername + 'plots') — confirm foldername ends with a separator.
conversation_refresh_times(headers_filename, nodelist_filename, edgelist_filename,
                           foldername + 'plots', plot=True)
generate_kmeans_clustering(mbox_filename,
                           author_uid_filename=author_uid_filename,
                           json_filename=headers_filename,
                           output_filename=foldername + "/json/kmeans_clustering.json",
                           top_n=250)
def write_matrix(json_data, tree_filename="infomap/output/author_graph.tree"):
    """
    Writes to a CSV file ("top_authors_data.csv") the Author Score, In-Degree,
    Out-Degree, Clustering Coeff and Module Flow coefficient ascertained from
    the output of the Infomap algorithm.

    NOTE(review): this re-defines a write_matrix with an identical body
    earlier in this file; the later definition wins at import time. Confirm
    and remove one copy.

    :param json_data: Dict mapping message-IDs to parsed message dicts (keys
        read here: 'From', 'To', 'Cc') for the dataset under analysis.
    :param tree_filename: Path of the tree file generated by the Infomap
        community detection module.
    :return: None
    """
    top_authors = set()
    top_authors_data = dict()
    # NOTE(review): other call sites pass a json_filename as the first
    # positional argument to ranking.get — confirm this call is not missing it.
    author_scores = ranking.get(active_score=2, passive_score=1,
                                write_to_file=False)
    # Keep only the 100 highest-ranked authors.
    for rank, (email_addr, author_score) in enumerate(author_scores, start=1):
        top_authors.add(email_addr)
        top_authors_data[email_addr] = [author_score]
        if rank == 100:
            break
    print("Adding nodes to author's graph...")
    author_graph = nx.DiGraph()
    for msg_id, message in json_data.items():
        if message['From'] in top_authors:
            # 'To' and 'Cc' are sets of addresses; Cc may be None.
            addr_list = message['To'] if message['Cc'] is None \
                else message['To'] | message['Cc']
            for to_address in addr_list:
                if to_address in top_authors:
                    if author_graph.has_edge(message['From'], to_address):
                        # NOTE(review): this *shrinks* the weight on repeated
                        # edges (w -> w*w/(w+1), e.g. 1 -> 0.5); a simple
                        # increment may have been intended — confirm before
                        # changing, as downstream metrics depend on it.
                        author_graph[message['From']][to_address]['weight'] *= \
                            author_graph[message['From']][to_address]['weight'] / \
                            (author_graph[message['From']][to_address]['weight'] + 1)
                    else:
                        author_graph.add_edge(message['From'], to_address, weight=1)
    # Clustering coefficient is defined on the undirected projection.
    author_graph_undirected = author_graph.to_undirected()
    clustering_coeff = nx.clustering(author_graph_undirected)
    # NOTE(review): nodes_iter() is NetworkX 1.x API (renamed nodes() in 2.x);
    # confirm the pinned networkx version before upgrading.
    in_degree_dict = author_graph.in_degree(nbunch=author_graph.nodes_iter())
    out_degree_dict = author_graph.out_degree(nbunch=author_graph.nodes_iter())
    for email_addr in top_authors:
        top_authors_data[email_addr].append(in_degree_dict[email_addr])
        top_authors_data[email_addr].append(out_degree_dict[email_addr])
        top_authors_data[email_addr].append(clustering_coeff[email_addr])
    print("Parsing", tree_filename + "...")
    with open(tree_filename, 'r') as tree_file:
        for line in tree_file:
            # Skip blank lines and '#' comment lines in the Infomap output.
            if not line or line[0] == '#':
                continue
            line = line.split()
            # Column 2 holds the quoted node name; [1:-1] strips the quotes.
            if line[2][1:-1] in top_authors:
                top_authors_data[line[2][1:-1]].append(float(line[1]))
    # Redundant explicit close() calls inside the with-blocks were removed;
    # the context managers already close both files.
    with open("top_authors_data.csv", 'w') as output_file:
        output_file.write(
            "Email Address,Author Score,In-Degree,Out-Degree,Clustering Coeff,Module Flow\n")
        for email_addr, data_list in top_authors_data.items():
            output_file.write(email_addr + "," +
                              ",".join([str(x) for x in data_list]) + "\n")
    print("Authors data written to file.")
for mailbox in mailbox_list: # Define directories path=os.path.abspath("lib/mlcatconfig/mlcat.cfg") path_ob= Config(mailbox) path_ob.read(path) path_ob.createVariables(); print("Processing Mailbox:", mailbox) vertex_clustering(path_ob.headers_filename, path_ob.nodelist_filename, path_ob.edgelist_filename, path_ob.foldername) generate_hyperedge_distribution(path_ob.nodelist_filename, path_ob.edgelist_filename, path_ob.headers_filename, path_ob.foldername) generate_keyword_digest(path_ob.mbox_filename, output_filename=path_ob.foldername+"/author_keyword_digest.txt", author_uid_filename=path_ob.author_uid_filename, json_filename=path_ob.headers_filename, top_n=250, console_output=False) ranking.get(path_ob.headers_filename, output_filename=path_ob.foldername+"/tables/author_ranking.csv", active_score=2, passive_score=1) generate_wh_table_authors(path_ob.nodelist_filename, path_ob.edgelist_filename, path_ob.foldername+'/tables/wh_table_authors.csv') conversation_refresh_times(path_ob.headers_filename, path_ob.nodelist_filename, path_ob.edgelist_filename, path_ob.foldername+'plots', plot=True) generate_kmeans_clustering(path_ob.mbox_filename, author_uid_filename=path_ob.author_uid_filename, json_filename=path_ob.headers_filename, output_filename=path_ob.foldername+"/json/kmeans_clustering.json", top_n=250) # For a range of months from Jan 2010 to Dec 2016, generate CL, RT curve fits yearly_curve_fit_coeffs = list() monthly_curve_fit_coeffs = list() for year in range(2015, 2017): for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']: if month in {'Jan', 'Mar', 'May', 'Jul', 'Aug', 'Oct', 'Dec'}: max_day = 31 elif month == 'Feb': max_day = 28 else: