def plot_encoding_B_functionality(all_files, changing_value, scale):
    """Plot the expected number of links needed per word against F or L.

    Params:
        all_files: iterable of (X_val, X_files) pairs; X_val keys into the
            module-level ``lines_styles_params`` to pick a line style, and
            X_files is a list of pickled stats file names loadable via
            ``words_stats.load_stats``.
        changing_value: 'F' or 'L' -- selects which stats attribute supplies
            the x-axis values, the x-axis label, and the output file name
            (plot_B.pdf for 'F', plot_C.pdf for 'L').
        scale: axis limits list passed straight to ``plt.axis``.

    Side effects: prints the collected points, saves a PDF, shows the figure.
    """
    plot_params = []
    for X_val, X_files in all_files:
        # current_X_points[0] = x-axis values, current_X_points[1] = y-axis values
        current_X_points = [[],[]]
        for stats_file in X_files:
            stats = words_stats.load_stats(stats_file)
            num_words = stats.L + stats.D + stats.F
            if changing_value == 'F':
                current_X_points[0].append(stats.F)
            elif changing_value == 'L':
                current_X_points[0].append(stats.L)
            # all encodings except those for ['wow', 'wow',...] and their likes
            # NOTE(review): assumes x[4] counts distinct words in the encoding
            # attempt -- confirm against the encoding_flow producer.
            flow = [x for x in stats.encoding_flow if x[4] >= (num_words / 2)]
            # successful_encoding_flow = [x for x in flow if x[2]]
            # num_of_encoding_attempts = len(flow)
            #
            # if len(successful_encoding_flow) >= 5:
            #     expected_links_needed_per_word = num_of_encoding_attempts / len(successful_encoding_flow)
            # else:
            #     # flow = [x for x in stats.encoding_flow if x[4] > 1]
            #     expected_links_needed_per_word = find_expected_num_of_links(stats, flow)
            # Cap at 1e6 so a degenerate estimate does not blow up the plot scale.
            expected_links_needed_per_word = min(find_expected_num_of_links(stats, flow),1000000)
            current_X_points[1].append(expected_links_needed_per_word)
        print 'current_X_points[0]: ', current_X_points[0]
        print 'current_X_points[1]: ', current_X_points[1]
        plot_params.append((current_X_points[0], current_X_points[1], X_val))
    plt.figure(figsize=figsize)
    for x_params in plot_params:
        # (style string, linewidth, legend label) keyed by X_val.
        line_properties = lines_styles_params[x_params[2]]
        x_axis_values = x_params[0]
        y_axis_values = x_params[1]
        x_narray = np.array(x_axis_values)
        y_narray = np.array(y_axis_values)
        # Smooth the sparse measurements with piecewise-linear interpolation
        # sampled at 40 evenly spaced points.
        x_line_space = np.linspace(x_narray.min(), x_narray.max(), 40)
        f_smooth = interp1d(x_narray, y_narray, kind='slinear')
        plt.plot(x_line_space, f_smooth(x_line_space), line_properties[0], linewidth= line_properties[1], label=line_properties[2])
    plt.ylabel('Expected Number of Links')
    plt.legend(loc=4, frameon=False, prop={'size':14})
    plt.axis(scale)
    if changing_value == 'F':
        plt.xlabel('F (Number of Function Words)')
        plt.savefig('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_plots/plot_B.pdf', bbox_inches='tight')
    elif changing_value == 'L':
        plt.xlabel('L (Number of Link Words)')
        plt.savefig('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_plots/plot_C.pdf', bbox_inches='tight')
    plt.show()
def create_list_of_links_to_be_manually_inspected(path='/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_a_results/'):
    """Export each stats pickle's collected links as a CSV for manual review.

    For every file under ``path``, loads it with ``words_stats.load_stats``
    and writes one '<url>,<value>' line per collected word (skipping the
    first entry) into a same-named .csv file in the a_a_results_csv dir.

    Params:
        path: directory containing the pickled stats files.

    Side effects: creates/overwrites one CSV file per stats file.
    """
    # listdir already returns a list -- the original wrapped it in a
    # redundant identity comprehension.
    file_names = listdir(path)
    for stats_file in file_names:
        stats = words_stats.load_stats(stats_file, path)
        # NOTE(review): assumes collected_words entries are tuples with the
        # URL at index 1 and a numeric value at index 2 -- confirm upstream.
        csv_str_list = [x[1] +','+str(x[2]) for x in stats.collected_words[1:]]
        file_name = '/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_a_results_csv/' + stats_file +'.csv'
        with open(file_name, 'w') as f:
            for link_line in csv_str_list:
                f.write('%s\n' % link_line)
def compute_all_download_times(path='/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_a_results/'): file_names = [ f for f in listdir(path)] times_of_all_files = [] for i,file_name in enumerate(file_names): print 'starting file number: ',i, ' file name:', file_name stats = words_stats.load_stats(file_name, path) urls = [x[1] for x in stats.collected_words[1:]] current_file_times_sum = 0. for i,url in enumerate(urls): print 'currently at link: ', i download_time = time_page(url) current_file_times_sum += download_time print 'sum is: ', current_file_times_sum times_of_all_files.append(current_file_times_sum) timing_file_name = '/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/timing.txt' with open(timing_file_name, 'w') as f: for timing in times_of_all_files: f.write('%s\n' % str(timing))
def plot_encoding_D():
    """Plot the CDF of links needed per encoding attempt for four dataset sizes.

    Hard-coded stats pickles for X = 100/250/500/750 words are loaded, the
    encoding_flow of each is split into per-attempt chunks, and the CDF of
    the number of links each successful attempt required is plotted as one
    line per X. Saves plot_D.pdf and shows the figure.
    """
    # Older runs of the same experiment, kept for reference:
    # X_100_file = 'stats_100_1_1_2_0_tweet_1_2015-01-14 19:33:38.758207.pkl'
    # X_250_file = 'stats_256_1_1_2_0_tweet_1_2015-01-17 13:22:15.402834.pkl'
    # X_500_file = 'stats_512_1_1_2_0_tweet_1_2015-01-17 13:09:06.839638.pkl'
    X_100_file = 'stats_100_1_1_2_0_tweet_1_2015-01-17 21:27:54.405155.pkl'
    X_250_file = 'stats_256_1_1_2_0_tweet_1_2015-01-17 22:10:22.532923.pkl'
    X_500_file = 'stats_512_1_1_2_0_tweet_1_2015-01-17 22:56:16.653616.pkl'
    X_750_file = 'stats_749_1_1_2_0_tweet_1_2015-01-14 20:40:46.187508.pkl'
    # X_100_file = 'stats_100_1_1_2_0_tweet_1_2015-01-14 19:33:38.758207.pkl'
    # X_250_file = 'stats_256_1_1_2_0_tweet_1_2015-01-14 19:54:36.261821.pkl'
    # X_500_file = 'stats_512_1_1_2_0_tweet_1_2015-01-14 20:16:07.154431.pkl'
    # X_750_file = 'stats_729_1_1_2_0_tweet_1_2015-01-14 20:40:46.187508.pkl'
    # X_100_file = 'stats_101_1_1_5_0_tweet_1_2015-01-11 13:40:06.319823.pkl'
    # X_250_file = 'stats_256_1_1_5_0_tweet_1_2015-01-11 16:33:32.267943.pkl'
    # X_500_file = 'stats_512_1_1_5_0_tweet_1_2015-01-13 20:01:11.988187.pkl'
    # X_750_file = 'stats_750_1_1_5_0_tweet_1_2015-01-14 00:16:50.589827.pkl'
    # X_750_file = 'stats_750_1_2_4_0_tweet_1_2015-01-14 01:51:00.545645.pkl'
    all_files = [(100,X_100_file),(250,X_250_file),(500,X_500_file),(750,X_750_file)]
    plt.figure(figsize=figsize)
    for X, stats_file in all_files:
        stats = words_stats.load_stats(stats_file)
        # A new chunk starts wherever encoding_flow[i][0] changes value
        # (consecutive entries sharing [0] belong to the same attempt).
        chunks_starts = [0]
        for i in range(1,len(stats.encoding_flow)):
            if stats.encoding_flow[i][0] != stats.encoding_flow[i-1][0]:
                chunks_starts.append(i)
        # Slice encoding_flow into the per-attempt chunks.
        chunks = []
        for i in range(0, len(chunks_starts)-1):
            start = chunks_starts[i]
            stop = chunks_starts[i+1]
            chunks.append(stats.encoding_flow[start:stop])
        chunks.append(stats.encoding_flow[chunks_starts[-1]:])
        num_words = stats.L + stats.D + stats.F
        # Keep only chunks whose first entry covers at least half the words
        # (same filter as plot_encoding_B_functionality); num_words / 2 is
        # integer division under Python 2.
        good_chunks = [chunk for chunk in chunks if chunk[0][4] >= num_words / 2]
        # x[-1][2] truthy marks a successful attempt; x[-1][1] is then the
        # number of links it needed. NOTE(review): confirm these tuple-index
        # meanings against the encoding_flow producer.
        num_links_required_in_successful_chunks = [x[-1][1] for x in good_chunks if x[-1][2]]
        if not num_links_required_in_successful_chunks:
            # No attempt actually succeeded: fall back to the expected number
            # of links estimated per chunk.
            expected_num_links_required_in_good_chunks = []
            for chunk in good_chunks:
                expected_links_needed = find_expected_num_of_links(stats, chunk)
                expected_num_links_required_in_good_chunks.append(expected_links_needed)
            num_links_required_in_successful_chunks = expected_num_links_required_in_good_chunks
        total_number_of_attempts = float(len(good_chunks))
        # Empirical CDF over 0..max+29 links, normalized by ALL good chunks
        # (so it plateaus below 1.0 when some attempts failed).
        cdf = []
        for i in range(int(max(num_links_required_in_successful_chunks)) + 30):
            cdf.append(len([x for x in num_links_required_in_successful_chunks if x <= i]) / total_number_of_attempts)
        plt.plot(cdf, lines_styles_params[X][0], linewidth=lines_styles_params[X][1], label=lines_styles_params[X][2])
    plt.axis([0, 20, 0., 1.1])
    plt.xlabel('Number of Links')
    plt.ylabel('CDF')
    plt.legend(loc='best', frameon=False)
    plt.savefig('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_plots/plot_D.pdf', bbox_inches='tight')
    plt.show()
def plot():
    """Exploratory plots over a hard-coded stats pickle's search_stats.

    Produces, in order: a CDF of the number of links passed per search, a
    percentage histogram of the same data, and two scatter plots of uncut
    essence size vs. word coverage. Each figure is shown interactively.
    """
    # Earlier datasets, kept for reference:
    # stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/COPY_752_statsdata.txt_1000_0.txt.pkl')
    # stats2 = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/COPY_750_statsdata.txt_750_0.txt.pkl')
    # stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/COPY_70_statsdata.txt_101_0.txt.pkl')
    # stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_stats/stats_7_101_2015-01-01 12:35:25.180845.pkl')
    # stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_stats/stats_8_94_2015-01-01 14:58:30.815052.pkl')
    # stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_stats/stats_8_750_2015-01-01 23:48:44.081310.pkl')
    stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_stats/stats_7_750_2015-01-01 23:43:10.180217.pkl')
    values = stats.search_stats.values()
    # values += stats2.search_stats.values()
    num_links_required = []
    total_number_of_attempts = 0
    for words_search in values:
        # Entries with point[2] == 1. mark a completed search within this
        # sequence; count at least one attempt even when none completed.
        search_end = [x for x in words_search if x[2] == 1.]
        total_number_of_attempts += max(1, len(search_end))
        start_i = 0
        for end in search_end:
            # NOTE(review): .index on the slice returns an index RELATIVE to
            # words_search[start_i:], but it is then combined with the
            # absolute start_i (end_i - start_i + 1, start_i = end_i + 1).
            # This looks like mixed relative/absolute indexing -- verify the
            # intended link counts before reusing this logic.
            end_i = words_search[start_i:].index(end)
            num_links_required.append(end_i - start_i + 1)
            start_i = end_i + 1
    total_number_of_attempts = float(total_number_of_attempts)
    # Empirical CDF over 0..130 links.
    cdf = []
    for i in range(131):
        cdf.append(len([x for x in num_links_required if x <= i]) / total_number_of_attempts)
    plt.axis([0, 130, 0., 1.1])
    plt.plot(cdf, '-r')
    plt.xlabel('# of links.txt passed to find feasible essence')
    plt.ylabel('CDF')
    plt.grid(True)
    plt.show()
    # Earlier histogram experiments, kept for reference:
    # x = num_links_required
    # fig = plt.figure()
    # ax = fig.add_subplot(1,1,1)
    # n, bins, patches=ax.hist(x, 30)
    # ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: ('%.2f')%(y/total_number_of_attempts)))
    # ax.set_ylabel('% of links.txt')
    # # ax.set_autoscaley_on(False)
    # plt.ylim(ymin=0,ymax=12.3) # the 101 / 7 stats /Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_stats/stats_7_101_2015-01-01 12:35:25.180845.pkl
    # # plt.ylim(ymin=0,ymax=13.8) # the 752 / 7 stats /Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/COPY_752_statsdata.txt_1000_0.txt.pkl
    # plt.xlabel('# of links.txt passed to find feasible essence')
    # plt.show()
    # plt.axis([0, 130, 0., 1.1])
    # plt.hist(num_links_required, bins=50)
    # plt.grid(True)
    # plt.show()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Weight each sample by 100/total so bar heights read as percentages.
    ax.hist(num_links_required, weights=np.zeros_like(num_links_required) + 100. / total_number_of_attempts, bins= [x for x in range(0,132,2)])
    # n, bins, patches = ax.hist(num_links_required, bins=100, normed=1, cumulative=0)
    plt.axis([0, 130, 0., 100])
    plt.xlabel('# of links.txt passed to find feasible essence')
    plt.ylabel('% of links.txt')
    plt.show()
    # ax.set_ylim([0,100])
    # ax.set_ylim((0,10))
    # ax.set_xlim((1,20))
    # plt.ylim(ymin=0,ymax=10)
    # num_links_used = sorted([x[2] for x in stats.words_collected])
    # cdf = [0] * (num_links_used[-1] + 1)
    # for i in range(len(num_links_used)):
    #     cdf[num_links_used[i]] += 1
    # for i in range(1,len(cdf)):
    #     cdf[i] += cdf[i-1]
    # for i in range(len(cdf)):
    #     cdf[i] /= float(cdf[-1])
    # # cdf += [1.]*30
    # # plt.axis([0, 100, 0., 1.1])
    # plt.plot(cdf, '-r')
    # plt.xlabel('# of links.txt passed to find feasible essence')
    # plt.ylabel('CDF')
    # plt.grid(True)
    # plt.show()
    #
    # num_links_used = [x[2] for x in stats.words_collected]
    # plt.axis([0.5, 100, 0, 30])
    # plt.hist(num_links_used,bins=50)
    # plt.xlabel('# of links.txt passed to find feasible essence')
    # plt.ylabel('% of links.txt')
    # plt.grid(True)
    # plt.show()
    # Scatter plots: each search point is (uncut essence size,
    # fraction of words in uncut essence, fraction of words in essence).
    uncut_essence_size = []
    percent_of_words_in_uncut_essence = []
    percent_of_words_in_essence = []
    for point_list in stats.search_stats.values():
        for point in point_list:
            uncut_essence_size.append(point[0])
            percent_of_words_in_uncut_essence.append(point[1])
            percent_of_words_in_essence.append(point[2])
    plt.xlabel('size of uncut essence')
    plt.ylabel('words in uncut essence')
    plt.plot(uncut_essence_size, percent_of_words_in_uncut_essence, 'or')
    plt.axis([0, 101, 0, 1.1])
    plt.show()
    plt.xlabel('size of uncut essence')
    plt.ylabel('words in essence')
    plt.plot(uncut_essence_size, percent_of_words_in_essence, 'ob')
    plt.axis([0, 101, 0, 1.1])
    plt.show()