def analyze_days(headers):
    """ Creates a bar chart showing the number of emails sent per day """
    util.log_print("Running Day of Week Analysis")
    days_of_week = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    email_days = list(map(lambda x: x["Date"][0].split(",")[0], headers))
    day_counts = []
    for day in days_of_week:
        day_counts.append(email_days.count(day))

    # Display statistics
    util.display_stats(day_counts,
                       "Statistics for days on which emails are sent:")

    # Configure bar chart
    num_days = len(days_of_week)
    ind = np.arange(num_days)
    bar_width = 0.6
    chart, ax = plt.subplots()
    rectangles = ax.bar(ind, day_counts, bar_width, color='r', alpha=0.6)
    ax.set_ylabel("Number of emails")
    ax.set_xlabel("Day")
    ax.set_title("Number of emails per day")
    ax.set_xticks(ind + bar_width / 20)
    ax.set_xticklabels(days_of_week)

    # Label each bar with its count
    for rect in rectangles:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., 1 * height,
                '%d' % int(height), ha='center', va='bottom')

    plt.savefig("../../res/images/days.png")
    plt.show()

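# For reference, a minimal sketch of the header dict shape the analysis
# functions in this module index into. The entry below is hypothetical: the
# field layout is inferred from lookups such as h["Date"][0] and h["From"][0],
# and the RFC 2822-style date string is an assumption.
EXAMPLE_HEADER = {
    "Date": ["Mon, 14 May 2001 16:39:00 -0700"],
    "From": ["alice@example.com"],
    "To": ["bob@example.com", "carol@example.com"],
    "Subject": ["Quarterly forecast"],
    "Content-Type": ["text/plain", "charset=us-ascii"],
}
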
def analyze_years(headers):
    """ Creates a line chart showing the number of emails sent per year """
    util.log_print("Running Year Analysis")
    # Normalize malformed years (e.g. "0001" -> "2001") while temporarily
    # masking a genuine "2000" so it is not altered by the middle replace.
    email_years = list(
        map(
            lambda x: x["Date"][0].split(" ")[3].replace("2000", "x").replace(
                "000", "200").replace("x", "2000"), headers))
    unique_years = list(set(email_years))
    unique_years.sort()
    year_counts = []
    for year in unique_years:
        year_counts.append(email_years.count(year))

    # Display statistics
    util.display_stats(year_counts,
                       "Statistics for years in which emails are sent:")

    # Configure line chart
    unique_years = list(map(int, unique_years))
    plt.plot(unique_years, year_counts, color='r', alpha=0.6)
    plt.xlim(1979, 2044)
    plt.ylabel('Number of Emails')
    plt.xlabel('Year')
    plt.title('Number of emails per year')
    plt.savefig("../../res/images/years.png")
    plt.show()

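# Quick sanity check of the year normalization above, factored out for
# illustration only. The "0001"-style malformed years are an assumption about
# the raw data; analyze_years itself does not call this helper.
def _normalize_year(raw_year):
    return raw_year.replace("2000", "x").replace("000",
                                                 "200").replace("x", "2000")

assert _normalize_year("0001") == "2001"  # malformed year repaired
assert _normalize_year("2000") == "2000"  # genuine 2000 left intact
assert _normalize_year("1999") == "1999"  # ordinary years pass through
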
def write_to_data_file(data_file_name, num_files):
    """ Retrieves data maps from the write queue and appends them to the
    specified data file. """
    global data_to_write_queue
    progress_bar = util.ProgressBar(num_files, 'Extracting headers', 73)
    counter = 0
    with open(data_file_name, FILE_WRITE_MODE) as data_file:
        while True:
            # Keep retrieving header data from the write queue until a timeout
            # exception is raised (which should mean that no more data will be
            # added to the queue).
            try:
                data = data_to_write_queue.get(block=True,
                                               timeout=QUEUE_TIMEOUT)
                stringified_data = util.stringify_headers(data)
                data_file.write('{0}\n'.format(stringified_data))
                counter += 1
                # Update progress bar.
                progress_bar.update(counter)
            except queue.Empty:
                # Finish up the writing process.
                progress_bar.clean()
                util.log_print('{0} entries written'.format(counter))
                break
            except Exception as error:
                print(error)
                util.log_print('{0} entries written'.format(counter))
                break

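# A minimal sketch of the producer side of the queue contract above. This
# helper is hypothetical -- the real extraction workers that feed
# data_to_write_queue live elsewhere in this script -- but it shows the shape
# of the hand-off: producers put header maps, and once they fall silent for
# QUEUE_TIMEOUT seconds the queue.Empty branch ends the writer loop.
def _enqueue_headers_sketch(header_maps):
    for header_map in header_maps:
        data_to_write_queue.put(header_map)
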
def main():
    """ Setup script for header extraction and writing. """
    args = create_arg_parser().parse_args()
    initialise_global_variables(args.ignore_x_headers)

    start_time = time.time()
    acquire_headers_and_write(args.path, args.file)
    end_time = time.time()

    # The writer always idles for QUEUE_TIMEOUT seconds before stopping, so
    # subtract that from the reported execution time.
    execution_time = (end_time - start_time - QUEUE_TIMEOUT) / 60
    util.log_print('{0:.4f} min'.format(execution_time))

def main():
    """ Setup for data cleaning. """
    args = create_arg_parser().parse_args()
    check_is_valid_file(args.file)
    initialise_global_variables(args.file)

    start_time = time.time()
    clean_data(args.file)
    end_time = time.time()

    execution_time = (end_time - start_time - QUEUE_TIMEOUT) / 60
    util.log_print('{0:.4f} min'.format(execution_time))

def read_headers(filename):
    """ Reads the headers from the specified file and returns them as a list """
    util.log_print("Reading Headers")
    headers = []
    counter = 0
    with open(filename) as file:
        with progressbar.ProgressBar(max_value=NUMBER_OF_HEADERS) as bar:
            for line in file:
                headers.append(json.loads(line))
                bar.update(counter)
                counter += 1
    return headers

def write_anonymized_headers(current_file_name, headers_list):
    """ Writes anonymized headers to a file. """
    # Create new file name from given file name.
    new_file = util.create_new_data_file_name(current_file_name,
                                              ANON_FILE_NAME_ADDITION)
    counter = 0
    # Write header content to new file.
    with open(new_file, FILE_WRITE_MODE) as data_file:
        for headers in headers_list:
            data = util.stringify_headers(headers)
            data_file.write('{0}\n'.format(data))
            counter += 1
    util.log_print("{} entries anonymized".format(counter))

def get_email_file_names(directory):
    """ Recursively collect all file names in specified directory. """
    file_names = []
    # Create spinner to show that script is busy processing.
    spinner = util.Spinner(
        'Seeking files in directory: "{0}"'.format(directory))
    spinner.start()
    # Perform a recursive walk in specified directory.
    for directory_path, directories, files in os.walk(directory):
        for file_name in files:
            file_names.append(os.path.join(directory_path, file_name))
    spinner.stop()
    util.log_print('Found {0} files'.format(len(file_names)))
    return file_names

def read_headers(filename):
    """ Reads the headers from the specified file and returns them as a list """
    # Get number of entries in given dataset file.
    NUMBER_OF_HEADERS = util.file_line_count(filename)
    # Continue reading of headers.
    util.log_print("{} entries found".format(NUMBER_OF_HEADERS))
    headers = []
    counter = 0
    with open(filename) as file:
        with progressbar.ProgressBar(max_value=NUMBER_OF_HEADERS) as bar:
            for line in file:
                headers.append(json.loads(line))
                bar.update(counter)
                counter += 1
    return headers

def analyze_domains(headers, top):
    """ Creates a horizontal bar chart showing the number of emails sent by
    the top domains """
    util.log_print("Running Domain Analysis")
    valid_headers = list(
        filter(lambda h: len(h["From"][0].split("@")) == 2, headers))
    domains = list(
        map(lambda h: h["From"][0].split("@")[1].split(".")[0],
            valid_headers))
    unique_domains = set(domains)
    domain_counts = {}
    counter = 0
    with progressbar.ProgressBar(max_value=len(unique_domains)) as bar:
        for domain in unique_domains:
            domain_counts[domain] = domains.count(domain)
            bar.update(counter)
            counter += 1
    sorted_domain_counts = sorted(domain_counts.items(),
                                  key=operator.itemgetter(1))
    sorted_domain_counts.reverse()
    chart_domains = []
    chart_domain_counts = []
    print("Top {0} domains that sent emails:".format(top))
    for x in range(top):
        chart_domains.append(sorted_domain_counts[x][0])
        chart_domain_counts.append(sorted_domain_counts[x][1])
        # Print results
        print("{0}. {1} - {2} emails sent".format(x + 1,
                                                  sorted_domain_counts[x][0],
                                                  sorted_domain_counts[x][1]))

    # Draw horizontal bar chart
    plt.rcdefaults()
    fig, ax = plt.subplots()
    y_pos = np.arange(len(chart_domains))
    ax.barh(y_pos, chart_domain_counts, align='center', color='green',
            ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(chart_domains)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Number of emails sent')
    ax.set_title('Emails Sent per Domain')
    plt.savefig("../../res/images/domainsA.png")
    plt.show()

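# Aside: the counting loop above is quadratic (list.count inside a loop over
# unique domains). A one-pass equivalent using collections.Counter, shown for
# reference only -- the explicit loop is kept above because it drives the
# progress bar:
def _count_domains_sketch(domains):
    from collections import Counter
    return dict(Counter(domains))
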
def analyze_subjects(headers, words_to_strip):
    """ Creates a word cloud of all subjects """
    util.log_print("Running Subject Analysis")
    # Map headers to subjects
    subjects = list(map(lambda h: h["Subject"][0], headers))
    text = " ".join(subjects)
    # Strip specified words from subjects
    for word in words_to_strip:
        text = text.replace(word, " ")
    # Generate and save word cloud
    word_cloud = WordCloud(width=1000, height=500,
                           stopwords=set(STOPWORDS)).generate(text)
    plt.figure(figsize=(15, 8))
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.show()
    plt.imsave("../../res/images/sub.png", word_cloud)

def read_unclean_data_file(unclean_data_file_name):
    """ Reads unclean data file and assigns unclean data lines to worker
    pool. """
    # Set up a worker pool for cleaning.
    pool = Pool(POOL_PROCESSOR_COUNT)
    # Attempt to open the unclean data file for reading.
    try:
        with open(unclean_data_file_name,
                  FILE_READ_MODE) as unclean_data_file:
            # Fetch every data line in the file.
            for data_line in unclean_data_file:
                # Assign an unclean data line for cleaning to a worker
                # process.
                pool.apply_async(clean_data_line, args=(data_line, ))
    # If there was an error reading the file, let the user know.
    except OSError:
        util.log_print(
            'Error reading file: "{0}"'.format(unclean_data_file_name))
        pool.terminate()
        return
    # Wait for worker pool to finish up.
    pool.close()
    pool.join()

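# Note: pool.apply_async discards exceptions raised inside clean_data_line
# unless the AsyncResult objects are collected. If worker failures should be
# surfaced, one option (a sketch, not wired into the flow above) is:
#
#     results = [pool.apply_async(clean_data_line, (line,)) for line in lines]
#     for result in results:
#         result.get()  # re-raises any worker exception here
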
def get_max_senders(headers, top):
    """ Creates a bar chart showing the number of emails sent by the top
    senders """
    util.log_print("Running Max Senders Analysis")
    email_addresses = list(map(lambda h: h["From"][0].split("@")[0], headers))
    unique_addresses = list(set(email_addresses))
    address_counts = {}
    counter = 0
    with progressbar.ProgressBar(max_value=len(unique_addresses)) as bar:
        for address in unique_addresses:
            address_counts[address] = email_addresses.count(address)
            bar.update(counter)
            counter += 1
    sorted_address_counts = sorted(address_counts.items(),
                                   key=operator.itemgetter(1))
    sorted_address_counts.reverse()
    graph_emails = []
    graph_counts = []
    for x in range(top):
        graph_emails.append(sorted_address_counts[x][0])
        graph_counts.append(sorted_address_counts[x][1])

    # Display statistics
    util.display_stats(graph_counts, "Statistics for emails sent per person:")

    # Configure bar chart
    plt.tight_layout()
    num_emails = len(graph_emails)
    ind = np.arange(num_emails)
    bar_width = 0.6
    chart, ax = plt.subplots()
    rectangles = ax.bar(ind, graph_counts, bar_width, color='r', alpha=0.6)
    ax.set_ylabel("Number of emails")
    ax.set_xlabel("Sender")
    ax.set_title("Number of emails per sender")
    ax.set_xticks(ind + bar_width / 20)
    ax.set_xticklabels(graph_emails)
    plt.xticks(rotation=90)
    plt.savefig("../../res/images/senders.png")
    plt.show()

def clean_data(unclean_data_file_name):
    """ Orchestrates reading and writing data processes. """
    global number_of_data_lines
    # Report original number of data lines.
    pre_log_entry = 'Found {0} entries in "{1}"'.format(
        number_of_data_lines.value, unclean_data_file_name)
    util.log_print(pre_log_entry)
    # Create a separate process to read unclean data file.
    read_process = Process(target=read_unclean_data_file,
                           args=(unclean_data_file_name, ))
    read_process.start()
    # Write clean data in current process.
    clean_data_file_name = util.create_new_data_file_name(
        unclean_data_file_name, CLEAN_FILE_NAME_ADDITION)
    entries_written = write_clean_data_file(clean_data_file_name)
    # Report number of clean data lines.
    post_log_entry = '{0} entries written in "{1}"'.format(
        entries_written, clean_data_file_name)
    util.log_print(post_log_entry)
    # Wait for reading process to finish up.
    read_process.join()

def analyze_basic(headers):
    """ Perform basic analysis on the email data set """
    util.log_print("Performing Basic Analysis")

    # Check how many people sent emails to themselves
    sent_to_self_count = len(
        list(
            filter(lambda header: header["From"][0] in header["To"][0],
                   headers)))
    sent_to_self_percentage = round(sent_to_self_count / len(headers) * 100,
                                    2)
    print("{0} Emails ({1}%) were sent from the senders to themselves".format(
        sent_to_self_count, sent_to_self_percentage))

    # Check how many emails were sent from the same domain
    valid_headers = list(
        filter(lambda h: len(h["From"][0].split("@")) == 2, headers))
    same_domain_count = len(
        list(
            filter(
                lambda d: d["From"][0].split("@")[1].split(".")[0] in " ".
                join(d["To"]), valid_headers)))
    same_domain_percentage = round(same_domain_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent from the same domain".format(
        same_domain_count, same_domain_percentage))

    # Check how many emails were sent to more than one recipient
    multiple_recipient_count = len(
        list(filter(lambda q: len(q["To"]) > 1, headers)))
    multiple_recipient_percentage = round(
        multiple_recipient_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent to more than one recipient".format(
        multiple_recipient_count, multiple_recipient_percentage))

    # Check how many emails were sent to a single recipient
    single_recipient_count = len(
        list(filter(lambda q: len(q["To"]) == 1, headers)))
    single_recipient_percentage = round(
        single_recipient_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent to only one recipient".format(
        single_recipient_count, single_recipient_percentage))

def analyze_content_types(headers, is_charset):
    """ Creates a pie chart of all content types or charsets """
    if is_charset:
        util.log_print("Running Charset Analysis")
        content_types = list(
            map(lambda h: h["Content-Type"][1].split("=")[1], headers))
    else:
        util.log_print("Running Content Type Analysis")
        content_types = list(map(lambda h: h["Content-Type"][0], headers))
    unique_types = list(set(content_types))
    counts = []
    for t in unique_types:
        # Count occurrences in the full list, not in the deduplicated one
        # (counting within unique_types would always yield 1).
        counts.append(content_types.count(t))
    chart, ax1 = plt.subplots()
    ax1.pie(counts, labels=unique_types, autopct='%1.1f%%', shadow=True,
            startangle=90)
    ax1.axis('equal')
    plt.savefig("../../res/images/content" + str(is_charset) + ".png")
    plt.show()

def analyze_times(headers):
    """ Creates a line chart showing the number of emails sent per hour """
    util.log_print("Running Time Analysis")
    hours = list(
        map(lambda x: int(x["Date"][0].split(" ")[4].split(":")[0]), headers))
    unique_hours = list(set(hours))
    unique_hours.sort()
    hours_count = []
    for hour in unique_hours:
        hours_count.append(hours.count(hour))

    # Display statistics
    util.display_stats(hours_count,
                       "Statistics for hours in which emails are sent:")

    # Configure line chart
    plt.plot(unique_hours, hours_count, color='g', alpha=0.6)
    plt.xlim(0, 24)
    plt.ylabel('Number of Emails')
    plt.xlabel('Hour of Day')
    plt.title('Number of emails per hour')
    plt.savefig("../../res/images/hours.png")
    plt.show()

def analyze_months(headers):
    """ Creates a bar chart showing the number of emails sent per month """
    util.log_print("Running Month Analysis")
    months_of_year = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ]
    email_months = list(map(lambda x: x["Date"][0].split(" ")[2], headers))
    month_counts = []
    for month in months_of_year:
        month_counts.append(email_months.count(month))

    # Display statistics
    util.display_stats(month_counts,
                       "Statistics for months in which emails are sent:")

    # Configure bar chart
    num_months = len(months_of_year)
    ind = np.arange(num_months)
    bar_width = 0.6
    chart, ax = plt.subplots()
    rectangles = ax.bar(ind, month_counts, bar_width, color='b', alpha=0.6)
    ax.set_ylabel("Number of emails")
    ax.set_xlabel("Month")
    ax.set_title("Number of emails per month")
    ax.set_xticks(ind + bar_width / 20)
    ax.set_xticklabels(months_of_year)

    # Label each bar with its count
    for rect in rectangles:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., 1 * height,
                '%d' % int(height), ha='center', va='bottom')

    plt.savefig("../../res/images/months.png")
    plt.show()

def main():
    """ Setup for data analysis. """
    args = create_arg_parser().parse_args()
    # Read headers into list
    headers = read_headers(args.file)

    # Perform basic analysis on the full data set
    af.analyze_basic(headers)

    # Perform exploratory analysis on full data set
    util.log_print("Performing Analysis on the Full Data Set")
    bulk_analyze(headers)

    # Perform analysis on emails sent to multiple recipients
    util.log_print(
        "Performing Analysis on Emails Sent to Multiple Recipients")
    multiple_rec_headers = list(filter(lambda h: len(h["To"]) > 1, headers))
    bulk_analyze(multiple_rec_headers)

    # Perform analysis on emails sent to a single recipient
    util.log_print("Performing Analysis on Emails Sent to a Single Recipient")
    single_rec_headers = list(filter(lambda h: len(h["To"]) == 1, headers))
    bulk_analyze(single_rec_headers)

args = parser.parse_args()

if args.dataset not in ('threshold', 'full', 'gwac'):
    raise Exception('dataset type not supported')

set_global_variables(args.dataset)

if not os.path.exists(data_path):
    os.makedirs(data_path)
if not os.path.exists(pic_path):
    os.makedirs(pic_path)

p = Pool()
for sigma in sigma_range:
    p.apply_async(gen,
                  args=(sigma, args.dataset, step_in_second, step,
                        sin_amp_threshold, continuous,
                        sample_seconds_in_one_day, num_days,
                        sample_points_in_one_day, history_len, data_path,
                        pic_path, temp_path, temp_list, relative_amp_range,
                        A_range, T_range, phi_range))
p.close()
p.join()
log_print("All subprocesses done.")

# Single-process debug call, kept for reference:
# gen(0.0, args.dataset, step_in_second, step, sin_amp_threshold,
#     continuous, sample_seconds_in_one_day, num_days,
#     sample_points_in_one_day, history_len, data_path, pic_path,
#     temp_path, temp_list, relative_amp_range, A_range, T_range,
#     phi_range)

def check_is_valid_file(file_name):
    """ Check if the specified file exists. """
    # If the file does not exist, let the user know and exit the script.
    if not is_file(file_name):
        util.log_print('File "{0}" does not exist'.format(file_name))
        sys.exit()

def gen(sigma, dataset, step_in_second, step, sin_amp_threshold, continuous,
        sample_seconds_in_one_day, num_days, sample_points_in_one_day,
        history_len, data_path, pic_path, temp_path, temp_list,
        relative_amp_range, A_range, T_range, phi_range):
    # Count how many samples will be generated.
    gen_num = 0
    for temp_file in temp_list:
        with open(os.path.join(temp_path, temp_file)) as file_handle:
            data_frame = np.loadtxt(file_handle)
        temp_y = data_frame[:, 1]
        temp_amp = abs(temp_y.min())
        if dataset == 'threshold' or dataset == 'full':
            param_combinations = product(relative_amp_range, T_range,
                                         phi_range)
        elif dataset == 'gwac':
            param_combinations = product(A_range, T_range, phi_range)
        else:
            raise Exception('dataset type not supported')
        for A_or_relative_amp, T, phi in param_combinations:
            if dataset == 'threshold' or dataset == 'full':
                A = temp_amp / A_or_relative_amp
                if A <= 1.5:
                    gen_num += 1
            else:
                gen_num += 1
    log_print("Number of samples to be generated: %d" % gen_num)

    ind = 0
    for temp_file in temp_list:
        # 1. Read gravitational microlensing signal template.
        with open(os.path.join(temp_path, temp_file)) as file_handle:
            data_frame = np.loadtxt(file_handle)
        temp_x = data_frame[:, 0]
        temp_y = data_frame[:, 1]
        tE = float(temp_file.split('_')[0])

        # 2. Cut the flat ends off the template.
        i = 0
        while True:
            if temp_y[i] != temp_y[i + 1]:
                break
            i += 1
        cutted_temp_x = temp_x[i:-(i + 1)]
        cutted_temp_y = temp_y[i:-(i + 1)]
        temp_len = len(cutted_temp_x)  # template length T'
        temp_amp = abs(cutted_temp_y.min())  # template amplitude A'

        if dataset == 'threshold' or dataset == 'full':
            param_combinations = product(relative_amp_range, T_range,
                                         phi_range)
        elif dataset == 'gwac':
            param_combinations = product(A_range, T_range, phi_range)
        else:
            raise Exception('dataset type not supported')

        for A_or_relative_amp, T, phi in param_combinations:
            # Number of outliers to inject.
            num_outliers = 3
            # Number of sample points in a period.
            sin_period = int(T * sample_points_in_one_day)
            if dataset == 'gwac':
                # Relative saliency.
                A = A_or_relative_amp
                relative_amp = temp_amp / A
            else:
                # Amplitude of sine curve.
                relative_amp = A_or_relative_amp
                A = temp_amp / A_or_relative_amp
                if A > 1.5:
                    continue

            # Generate a single period of the background signal.
            single_period_sin = A * np.sin(
                S.linspace(0 + phi, 2 * np.pi + phi, sin_period))

            # Final length.
            if dataset == 'full' or dataset == 'threshold':
                final_len = history_len + temp_len
            else:
                final_len = history_len

            # Tile whole periods, then top up with a partial period so the
            # background reaches exactly final_len samples.
            dup_time = final_len // sin_period
            res_len = final_len % sin_period
            basic_sin = np.tile(single_period_sin, dup_time)
            res_sin = single_period_sin[:res_len]
            basic_sin = np.concatenate([basic_sin, res_sin], axis=0)

            if continuous:
                final_x = [i * step for i in range(final_len)]
                noised_sin = basic_sin + normal(0, sigma, len(basic_sin))
            else:
                uncontinuous_basic_sin = np.array([])
                final_x = []
                for day in range(num_days):
                    # The phase of the first section is fixed for injecting
                    # the ML signal.
                    if day == 0:
                        random_start_ind = 0
                    else:
                        random_start_ind = randint(0, final_len // 2)
                    uncontinuous_basic_sin = np.concatenate([
                        uncontinuous_basic_sin,
                        basic_sin[random_start_ind:random_start_ind +
                                  sample_points_in_one_day]
                    ], axis=0)
                    day_x = [
                        day + i * step
                        for i in range(sample_points_in_one_day)
                    ]
                    final_x.extend(day_x)
                assert len(basic_sin) == len(uncontinuous_basic_sin)
                noised_sin = uncontinuous_basic_sin \
                    + normal(0, sigma, len(basic_sin))

            # We inject the ML signal into the first section of the background
            # signal and then horizontally reverse the signal to ensure that
            # the ML signal is overlapped on the desired phase of the
            # background signal.
            final_y = noised_sin
            final_y[:temp_len] = final_y[:temp_len] + cutted_temp_y
            final_y = final_y[::-1]
            assert len(final_y) == final_len
            assert len(final_x) == len(final_y)
            tE_col = np.tile(tE, final_len)
            temp_start_col = np.tile(final_x[-temp_len:][0], final_len)
            temp_end_col = np.tile(final_x[-temp_len:][-1], final_len)
            A_prime = temp_amp
            T_prime = cutted_temp_x[-1] - cutted_temp_x[0]

            # Write data.
            data_name = "%.3f_%.3f_%.3f_%.3f_%.3f_%.3f.dat" % (
                sigma, T_prime, T, A_prime, relative_amp, phi)
            with open(os.path.join(data_path, data_name), 'w') as data_file:
                for i in range(final_len):
                    # Inject outlier points.
                    if randint(10000) == 0:
                        if num_outliers > 0:
                            final_y[i] = -A_prime + normal(0, sigma)
                            num_outliers -= 1
                    print("%5.8f %5.2f %5.3f %5.8f %5.8f" %
                          (final_x[i], final_y[i], tE_col[i],
                           temp_start_col[i], temp_end_col[i]),
                          file=data_file)

            # Write image.
            pic_name = "%.3f_%.3f_%.3f_%.3f_%.3f_%.3f.png" % (
                sigma, T_prime, T, A_prime, relative_amp, phi)
            pic_file = os.path.join(pic_path, pic_name)
            pylab.figure()
            pylab.plot(final_x, final_y, '.')
            pylab.title(pic_name)
            pylab.ylim(final_y.max() + 0.1, final_y.min() - 0.1)
            pylab.ylabel('mag')
            pylab.xlabel('t-t0(day)')
            pylab.savefig(pic_file)
            pylab.close()
            ind += 1
            log_print("finished generating: %s/%s, %d samples generated" %
                      (data_path, data_name, ind))

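# A minimal standalone sketch of the period-tiling step used in gen above:
# one sine period is repeated with np.tile and topped up with a partial
# period so the background reaches exactly final_len samples. Names here are
# illustrative, not taken from the script.
import numpy as np

single_period = np.sin(np.linspace(0, 2 * np.pi, 50))
final_len = 172
dup_time, res_len = divmod(final_len, len(single_period))
background = np.concatenate(
    [np.tile(single_period, dup_time), single_period[:res_len]])
assert len(background) == final_len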