def content_affinity_vs_distance():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    DataAnalysis._plot_affinities('similarity')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Hashtags sharing similarity')
#    plt.show()
    savefig(output_file)
def iid_vs_cumulative_distribution_and_peak_distribution():
    TIME_UNIT_IN_SECONDS = 10.*60.
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    ltuo_iid_and_interval_stats = \
        [data for data in FileIO.iterateJsonFromFile(f_iid_spatial_metrics, remove_params_dict=True)]
    ltuo_s_iid_and_interval_stats = sorted(ltuo_iid_and_interval_stats, key=itemgetter(0))
    ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences = \
        [(data[0], (data[1][0], data[1][2])) for data in ltuo_s_iid_and_interval_stats]
    total_peaks = sum([data[1][0]
                       for data in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences]) + 0.0
    x_iids = []
    y_is_peaks = []
    z_cumulative_percentage_of_occurrencess = []
    for (iid, (is_peak, cumulative_percentage_of_occurrences)) in \
            ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences[:100]:
        print (iid, (is_peak, cumulative_percentage_of_occurrences))
        x_iids.append((iid+1)*TIME_UNIT_IN_SECONDS/60)
        y_is_peaks.append(is_peak/total_peaks)
        z_cumulative_percentage_of_occurrencess.append(cumulative_percentage_of_occurrences)
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    plt.plot(x_iids, y_is_peaks, marker='o', c='k')
    plt.ylabel('Distribution of hashtags')
    plt.xlabel('Hashtag peak (minutes)')
    plt.grid(True)
    plt.xlim(xmax=600)
    savefig(output_file_format%'peaks')
    plt.clf()
    plt.figure(num=None, figsize=(6,3))
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    plt.plot(x_iids, z_cumulative_percentage_of_occurrencess, lw=0, marker='o', c='k')
#    plt.xlabel('Minutes')
    plt.ylabel('CDF of occurrences')
    plt.xlabel('Time (Minutes)')
    plt.grid(True)
    plt.xlim(xmax=600)
    savefig(output_file_format%'cdf_occurrences_peak')
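# A minimal sketch (not part of the pipeline above) of how an interval id
# (iid) is presumably derived: occurrence timestamps are bucketed into
# 10-minute windows relative to a start time, which is why
# (iid+1)*TIME_UNIT_IN_SECONDS/60 above recovers minutes. The helper name
# and the window origin are assumptions.
def occurrence_time_to_iid(occurrence_time, start_time, time_unit_in_seconds=10.*60.):
    # Integer index of the 10-minute window containing occurrence_time.
    return int((occurrence_time - start_time)/time_unit_in_seconds)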
def ef_plot():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id()+'.png'
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    ltuo_hashtag_and_entropy_and_focus = map(itemgetter('hashtag', 'entropy', 'focus'), data)
    mf_norm_focus_to_entropies = defaultdict(list)
    for _, entropy, (_, focus) in ltuo_hashtag_and_entropy_and_focus:
        mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
    plt.figure(num=None, figsize=(6,3))
    x_focus, y_entropy = zip(*[(norm_focus, np.mean(entropies))
                               for norm_focus, entropies in mf_norm_focus_to_entropies.iteritems()
                               if len(entropies)>0])
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    plt.scatter(x_focus, y_entropy, s=50, lw=0, c='k')
    plt.xlim(xmin=-0.1, xmax=1.1)
    plt.ylim(ymin=-1, ymax=9)
    plt.xlabel('Mean hashtag focus')
    plt.ylabel('Mean hashtag entropy')
    plt.grid(True)
    savefig(output_file)
    ltuo_hashtag_and_r_entropy_and_focus = \
        sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
    ltuo_hashtag_and_r_entropy_and_s_focus = sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
    hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
    print list(hashtags[:20])
    print list(reversed(hashtags))[:20]
def temporal_affinity_vs_distance():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    DataAnalysis._plot_affinities('adoption_lag')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Hashtag adoption lag (hours)')
#    plt.show()
    savefig(output_file)
def generate_data_for_significant_nei_utm_ids():
    output_file = GeneralMethods.get_method_id()+'.json'
    so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
        for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
            if hashtag!='total_num_of_occurrences':
                so_hashtags.add(hashtag)
        mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] = \
            utm_object['mf_nei_utm_id_to_common_h_count'].keys()
    hashtags = sorted(list(so_hashtags))
    mf_utm_id_to_vector = {}
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#        print i, utm_object['utm_id']
        utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0), hashtags)
        mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
    for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
        print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
        ltuo_utm_id_and_vector = [(utm_id, vector)]
        for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
            if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
        od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
        df_utm_vectors = robjects.DataFrame(od)
        df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
        dfm_dict = cjson.decode(df_utm_vectors_json)
        mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
        utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
        dfm_dict['prediction_variable'] = utm_id_colname
        dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                 df_utm_vectors.colnames)
        dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames,
                                                          zip(*ltuo_utm_id_and_vector)[0]))
        FileIO.writeToFileAsJson(dfm_dict, output_file)
def significant_nei_utm_ids_on_map():
    # Renamed from significant_nei_utm_ids: an unrelated variable-selection
    # function with that name is defined later in this module and would
    # otherwise shadow this plotting function.
    output_folder = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'/%s.png'
    for i, data in enumerate(FileIO.iterateJsonFromFile(f_significant_nei_utm_ids, remove_params_dict=True)):
        utm_lat_long = UTMConverter.getLatLongUTMIdInLatLongForm(data['utm_id'])
        nei_utm_lat_longs = map(
            lambda nei_utm_id: UTMConverter.getLatLongUTMIdInLatLongForm(nei_utm_id),
            data['nei_utm_ids']
        )
        if nei_utm_lat_longs:
            output_file = output_folder%('%s_%s'%(utm_lat_long))
            plotPointsOnWorldMap(nei_utm_lat_longs, blueMarble=False, bkcolor='#CFCFCF',
                                 lw=0, color='#EA00FF', alpha=1.)
            _, m = plotPointsOnWorldMap([utm_lat_long], blueMarble=False, bkcolor='#CFCFCF',
                                        lw=0, color='#2BFF00', s=40, returnBaseMapObject=True, alpha=1.)
            for nei_utm_lat_long in nei_utm_lat_longs:
                m.drawgreatcircle(utm_lat_long[1], utm_lat_long[0],
                                  nei_utm_lat_long[1], nei_utm_lat_long[0],
                                  color='#FFA600', lw=1.5, alpha=1.0)
            print 'Saving %s'%(i+1)
            savefig(output_file)
def spatial_metrics_cdf():
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    def plot_graph(locality_measures, id):
        mf_apprx_to_count = defaultdict(float)
        for measure in locality_measures:
            mf_apprx_to_count[round(measure,3)] += 1
        total_hashtags = sum(mf_apprx_to_count.values())
        current_val = 0.0
        x_measure, y_distribution = [], []
        for apprx, count in sorted(mf_apprx_to_count.iteritems(), key=itemgetter(0)):
            current_val += count
            x_measure.append(apprx)
            y_distribution.append(current_val/total_hashtags)
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15, wspace=0)
        plt.scatter(x_measure, y_distribution, lw=0, marker='o', c='k', s=25)
        plt.ylim(ymax=1.2)
        if id!='Coverage': plt.xlabel('%s'%id)
        else: plt.xlabel('%s (miles)'%id)
        plt.ylabel('CDF')
        plt.grid(True)
        savefig(output_file_format%('cdf_'+id))
    def plot_coverage(locality_measures, id):
        mf_apprx_to_count = defaultdict(float)
        for measure in locality_measures:
            mf_apprx_to_count[round(measure,3)] += 1
        total_hashtags = sum(mf_apprx_to_count.values())
        current_val = 0.0
        x_measure, y_distribution = [], []
        for apprx, count in sorted(mf_apprx_to_count.iteritems(), key=itemgetter(0)):
            current_val += count
            x_measure.append(apprx)
            y_distribution.append(current_val/total_hashtags)
        plt.figure(num=None, figsize=(4.3,3))
        ax = plt.subplot(111)
        ax.set_xscale('log')
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15, wspace=0)
        plt.scatter(x_measure, y_distribution, lw=0, marker='o', c='k', s=25)
        plt.ylim(ymax=1.2)
        # Fixed: the original tested id!='Coverage', which made the
        # '(miles)' label unreachable for the only caller (id='Spread').
        if id!='Spread': plt.xlabel('%s'%id)
        else: plt.xlabel('Spread (miles)')
        plt.ylabel('CDF')
        plt.xlim(xmin=1.)
        plt.grid(True)
        savefig(output_file_format%('cdf_'+id))
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    focuses = map(itemgetter(1), map(itemgetter('focus'), data))
    entropies = map(itemgetter('entropy'), data)
    coverages = map(itemgetter('spread'), data)
    print 'Mean entropy: ', np.mean(entropies)
    print 'Mean focus: ', np.mean(focuses)
    print 'Median entropy: ', np.median(entropies)
    print 'Median focus: ', np.median(focuses)
    plot_graph(focuses, 'Focus')
    plot_graph(entropies, 'Entropy')
    plot_coverage(coverages, 'Spread')
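# A minimal sketch of the empirical-CDF construction that plot_graph and
# plot_coverage both repeat above: values are rounded into buckets, bucket
# counts are accumulated in sorted order, and the running total is
# normalized. The helper name is an assumption, not part of the module.
def empirical_cdf(values, precision=3):
    from collections import defaultdict
    bucket_counts = defaultdict(float)
    for value in values:
        bucket_counts[round(value, precision)] += 1
    total = sum(bucket_counts.values())
    xs, ys, running = [], [], 0.0
    for bucket in sorted(bucket_counts):
        running += bucket_counts[bucket]
        xs.append(bucket)
        ys.append(running/total)
    return xs, ys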
def hashtag_groups_dot_files(association_measure_file=f_fisher_exact_association_measure):
    output_file_format = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+\
        '/'+association_measure_file.split('/')[-1]+'/%s.dot'
    for line_no, data in \
            enumerate(FileIO.iterateJsonFromFile(association_measure_file, remove_params_dict=True)):
        _, _, edges = data
        graph = nx.Graph()
        for edge in edges:
            u, v, attr_dict = edge
            u = unicode(u).encode('utf-8')
            v = unicode(v).encode('utf-8')
            graph.add_edge(u, v, attr_dict)
        output_file = output_file_format%line_no
        print 'Writing file: ', output_file
        FileIO.createDirectoryForFile(output_file)
        nx.write_dot(graph, output_file)
def top_k_locations_on_world_map():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    ltuo_location_and_occurrence_count = []
    for location_object in \
            FileIO.iterateJsonFromFile(f_dense_hashtag_distribution_in_locations, remove_params_dict=True):
        ltuo_location_and_occurrence_count.append([
            location_object['location'],
            location_object['occurrences_count']
        ])
    ltuo_lid_and_r_occurrence_count = sorted(ltuo_location_and_occurrence_count,
                                             key=itemgetter(1), reverse=True)
#    for i, d in enumerate(ltuo_lid_and_r_occurrence_count):
#        print i, d
#    exit()
    lids = zip(*ltuo_lid_and_r_occurrence_count)[0][:200]
    points = map(UTMConverter.getLatLongUTMIdInLatLongForm, lids)
    plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c='m', lw=0, alpha=1.)
    savefig(output_file)
def peak_stats():
    TIME_UNIT_IN_SECONDS = 10.*60.
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    peaks = map(itemgetter('peak_iid'), data)
    peaks = filter(lambda i: i<288, peaks)
    ltuo_peak_and_count = [(peak, len(list(ito_peaks))) for peak, ito_peaks in groupby(sorted(peaks))]
    ltuo_s_peak_and_count = sorted(ltuo_peak_and_count, key=itemgetter(0))
    current_count = 0.0
    total_count = len(peaks)+0.
    print total_count
    ltuo_peak_and_cdf = []
    for peak, count in ltuo_s_peak_and_count:
        current_count += count
        ltuo_peak_and_cdf.append([(peak+1)*TIME_UNIT_IN_SECONDS/(60.), current_count/total_count])
    x_peaks, y_cdf = zip(*ltuo_peak_and_cdf)
    plt.figure(num=None, figsize=(4.3,3))
    ax = plt.subplot(111)
    ax.set_xscale('log')
    plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15)
    plt.scatter(x_peaks, y_cdf, c='k', s=50, lw=0)
    plt.xlabel('Time (minutes)')
    plt.ylabel('CDF')
    plt.xlim(xmin=5.)
    plt.grid(True)
#    plt.show()
    savefig(output_file_format%'peak_cdf')
    plt.clf()
#    plt.figure(num=None, figsize=(4.3,3))
    ax = plt.subplot(111)
    ax.set_xscale('log')
    ax.set_yscale('log')
    x_peaks, y_counts = zip(*ltuo_s_peak_and_count)
    x_peaks = [(peak+1)*TIME_UNIT_IN_SECONDS/(60.) for peak in x_peaks]
    y_counts = [count/total_count for count in y_counts]
    plt.scatter(x_peaks, y_counts, c='k', s=50, lw=0)
    plt.xlabel('Time (minutes)')
    plt.ylabel('Distribution of hashtags')
    plt.xlim(xmin=5)
    plt.ylim(ymax=1., ymin=0.00005)
    plt.grid(True)
    savefig(output_file_format%'peak_dist')
def significant_nei_utm_ids():
    mf_utm_id_to_valid_nei_utm_ids = {}
    def get_utm_vectors():
        so_hashtags = set()
        for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                if hashtag!='total_num_of_occurrences':
                    so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] = \
                utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags, ltuo_utm_id_and_vector = sorted(list(so_hashtags)), []
        for i, utm_object in enumerate(FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True)):
#            print i, utm_object['utm_id']
            utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                hashtags)
            ltuo_utm_id_and_vector.append((utm_object['utm_id'], robjects.FloatVector(utm_id_vector)))
        od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
        df_utm_vectors = robjects.DataFrame(od)
        return df_utm_vectors
    output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()
    df_utm_vectors = get_utm_vectors()
#    print df_utm_vectors.nrow
#    exit()
    utm_colnames = df_utm_vectors.colnames
    mf_utm_id_to_utm_colnames = dict(zip(sorted(mf_utm_id_to_valid_nei_utm_ids), utm_colnames))
    mf_utm_colnames_to_utm_id = dict(zip(utm_colnames, sorted(mf_utm_id_to_valid_nei_utm_ids)))
    for i, utm_colname in enumerate(utm_colnames):
        utm_id = mf_utm_colnames_to_utm_id[utm_colname]
        prediction_variable = utm_colname
        print i, utm_id
        predictor_variables = [mf_utm_id_to_utm_colnames[valid_nei_utm_id]
                               for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]
                               if valid_nei_utm_id in mf_utm_id_to_utm_colnames and
                                   valid_nei_utm_id != utm_id]
        selected_utm_colnames = R_Helper.variable_selection_using_backward_elimination(
            df_utm_vectors,
            prediction_variable,
            predictor_variables,
            debug=True
        )
        nei_utm_ids = [mf_utm_colnames_to_utm_id[selected_utm_colname]
                       for selected_utm_colname in selected_utm_colnames]
        print 'Writing to: ', output_file
        FileIO.writeToFileAsJson({'utm_id': utm_id, 'nei_utm_ids': nei_utm_ids}, output_file)
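# R_Helper.variable_selection_using_backward_elimination is project-internal
# and not shown here. A minimal sketch of equivalent behavior using rpy2 and
# R's AIC-driven step() is below; that step() matches the project's helper is
# an assumption, only the rpy2/R calls themselves are standard.
def backward_elimination_sketch(df, prediction_variable, predictor_variables):
    import rpy2.robjects as robjects
    # Fit the full linear model, then let R drop predictors one at a time.
    formula = robjects.Formula('%s ~ %s'%(prediction_variable, ' + '.join(predictor_variables)))
    full_fit = robjects.r['lm'](formula, data=df)
    reduced_fit = robjects.r['step'](full_fit, direction='backward', trace=0)
    # The surviving predictors are the term labels of the reduced model.
    return list(robjects.r['attr'](robjects.r['terms'](reduced_fit), 'term.labels'))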
def plot_global_influencers(ltuo_model_id_and_hashtag_tag):
    tuples_of_boundary_and_boundary_label = [
        ([[-90,-180], [90, 180]], 'World', 'm'),
    ]
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        print model_id, hashtag_tag
        tuples_of_location_and_color = []
        for boundary, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label:
            tuo_location_and_influence_scores = \
                Experiments.load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag, boundary)
            tuo_location_and_influence_scores = \
                sorted(tuo_location_and_influence_scores, key=itemgetter(1))[:10]
            locations = zip(*tuo_location_and_influence_scores)[0]
            for location in locations:
                tuples_of_location_and_color.append(
                    [getLocationFromLid(location.replace('_', ' ')), boundary_color])
        locations, colors = zip(*tuples_of_location_and_color)
        plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors, lw=0, alpha=1.)
        for _, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label:
            plt.scatter([0], [0], label=boundary_label, c=boundary_color, lw=0)
#        plt.legend(loc=3, ncol=4, mode="expand")
#        plt.show()
        savefig(fld_results%(GeneralMethods.get_method_id())+'%s_%s.png'%(model_id, hashtag_tag))
def entropy_examples():
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    # 'num_of_occurrenes' (sic) is the key as it is spelled in the data files.
    ltuo_hashtag_and_num_of_occurrences_and_entropy = \
        map(itemgetter('hashtag', 'num_of_occurrenes', 'entropy'), data)
    ltuo_hashtag_and_num_of_occurrences_and_entropy = \
        map(lambda (h, n, e): (h, n, round(e, 0)), ltuo_hashtag_and_num_of_occurrences_and_entropy)
    for entropy, entropy_data in \
            GeneralMethods.group_items_by(ltuo_hashtag_and_num_of_occurrences_and_entropy, itemgetter(2)):
        entropy_data.sort(key=itemgetter(1))
        hashtags = map(itemgetter(0), entropy_data)
        print entropy, len(entropy_data), hashtags[:25]
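# For reference, a minimal sketch of how a hashtag's entropy and focus are
# presumably computed from its per-location occurrence counts: entropy is the
# Shannon entropy (base 2) of the spatial distribution, and focus is the
# location holding the largest share of occurrences together with that share.
# The helper name and signature are assumptions.
def spatial_entropy_and_focus(mf_location_to_occurrence_count):
    from math import log
    from operator import itemgetter
    total = sum(mf_location_to_occurrence_count.values()) + 0.0
    entropy = -sum((count/total)*log(count/total, 2)
                   for count in mf_location_to_occurrence_count.itervalues() if count > 0)
    focus_location, focus_count = max(mf_location_to_occurrence_count.iteritems(), key=itemgetter(1))
    return entropy, (focus_location, focus_count/total)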
def compare_zones_with_test_set(ltuo_model_id_and_hashtag_tag, test_model_id):
    output_file = fld_results%GeneralMethods.get_method_id()+'results.csv'
    GeneralMethods.runCommand('rm -rf %s'%output_file)
    mf_model_id_to_misrank_accuracies = defaultdict(list)
    mf_model_id_to_mf_location_to_zone_id = {}
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        no_of_zones, ltuo_location_and_influence_score_and_zone_id = \
            Experiments.get_location_with_zone_ids(model_id, hashtag_tag)
        locations, influence_scores, zone_ids = zip(*ltuo_location_and_influence_score_and_zone_id)
        mf_model_id_to_mf_location_to_zone_id[model_id] = dict(zip(locations, zone_ids))
    ltuo_hashtag_and_ltuo_location_and_occurrence_time = \
        Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
    for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in \
            enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
#        print hashtag_count
#        if hashtag_count==10: break
        ltuo_location_and_occurrence_time = sorted(ltuo_location_and_occurrence_time, key=itemgetter(1))
#        hashtag_zone_ids = [for ltuo_location, _ in ltuo_location_and_occurrence_time]
        locations = reduce(InfluenceAnalysis._to_locations_based_on_first_occurence,
                           zip(*ltuo_location_and_occurrence_time)[0], [])
#        mf_location_to_hashtags_location_rank = dict(zip(locations, range(len(locations))))
#    for hashtag_count, (hashtag, ltuo_location_and_pure_influence_score) in \
#            enumerate(Experiments.load_ltuo_test_hashtag_and_ltuo_location_and_pure_influence_score(test_model_id)):
#        locations = zip(*ltuo_location_and_pure_influence_score)[0]
        for model_id, mf_location_to_zone_id in mf_model_id_to_mf_location_to_zone_id.iteritems():
            models_location_rank = [mf_location_to_zone_id[location]
                                    for location in locations if location in mf_location_to_zone_id]
#            print models_location_rank
            if len(models_location_rank)>1:
                misrank_accuracies = map(
                    InfluenceAnalysis._get_rank_accuracy,
                    zip(models_location_rank, [models_location_rank]*len(models_location_rank))
                )
                mf_model_id_to_misrank_accuracies[model_id].append(np.mean(misrank_accuracies))
                # Random model
#                random_location_rank = range(len(locations))
                # Fixed: copy before shuffling; the original aliased
                # models_location_rank and shuffled it in place.
                random_location_rank = models_location_rank[:]
                random.shuffle(random_location_rank)
                random_misrank_accuracies = map(
                    InfluenceAnalysis._get_rank_accuracy,
                    zip(random_location_rank, [random_location_rank]*len(random_location_rank))
                )
                data = ', '.join([str(hashtag_count),
                                  str(len(ltuo_location_and_occurrence_time)),
                                  str(np.mean(misrank_accuracies)),
                                  str(np.mean(random_misrank_accuracies)),
                                  str(len(models_location_rank))])
                FileIO.writeToFile(data, output_file)
def ef_plots_for_peak():
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    def getNearestNumber(num):
        return (int(round(num,2)*100/100)*100 + int((round(num,2)*100%100)/3)*3)/100.
    def plot_correlation_ef_plot(condition, id, hashtags, focuses, entropies, peaks):
        TIME_UNIT_IN_SECONDS = 10.*60.
        mf_norm_focus_to_entropies = defaultdict(list)
        mf_norm_focus_to_peaks = defaultdict(list)
        for focus, entropy, peak in zip(focuses, entropies, peaks):
            if condition(peak):
                mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
                mf_norm_focus_to_peaks[round(focus, 2)].append(peak)
        # Comprehension variables renamed (bucket_*): in Python 2 they leak
        # out of the list comprehension and the originals clobbered the
        # entropies/peaks parameters used below.
        x_focus, y_entropy = zip(*[(norm_focus, np.mean(bucket_entropies))
                                   for norm_focus, bucket_entropies in mf_norm_focus_to_entropies.iteritems()
                                   if len(bucket_entropies)>5])
        _, z_peak = zip(*[(norm_focus, np.mean(bucket_peaks)*TIME_UNIT_IN_SECONDS/60)
                          for norm_focus, bucket_peaks in mf_norm_focus_to_peaks.iteritems()
                          if len(bucket_peaks)>5])
        plt.figure(num=None, figsize=(6,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        cm = matplotlib.cm.get_cmap('cool')
        sc = plt.scatter(x_focus, y_entropy, c=z_peak, cmap=cm, s=50, lw=0)
        plt.colorbar(sc)
        plt.xlim(xmin=-0.1, xmax=1.1)
        plt.ylim(ymin=-1, ymax=9)
        plt.xlabel('Mean hashtag focus')
        plt.ylabel('Mean hashtag entropy')
        plt.grid(True)
        savefig(output_file_format%id)
        ltuo_hashtag_and_entropy_and_focus = zip(hashtags, entropies, focuses)
        ltuo_hashtag_and_r_entropy_and_focus = \
            sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
        ltuo_hashtag_and_r_entropy_and_s_focus = \
            sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
        hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
        print id, list(hashtags)
        print id, list(reversed(hashtags))
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    hashtags = map(itemgetter('hashtag'), data)
    focuses = map(itemgetter(1), map(itemgetter('focus'), data))
    entropies = map(itemgetter('entropy'), data)
    peaks = map(itemgetter('peak_iid'), data)
    def gt_288(peak):
        # Fixed: the original tested 288>peak, which contradicted the
        # function's name; peaks between 288 and 1008 intervals are intended.
        return 288 < peak < 1008
    def lt_6(peak):
        return peak < 6
    def lt_144(peak):
        return peak < 144
    plot_correlation_ef_plot(gt_288, 'gt_288', hashtags, focuses, entropies, peaks)
    plot_correlation_ef_plot(lt_6, 'lt_6', hashtags, focuses, entropies, peaks)
def location_influence_plots(model_ids, no_of_bins_for_influence_score=100):
    for model_id in model_ids:
        output_file_format = fld_results%(GeneralMethods.get_method_id()) + '%s_%s.png'
        tuo_input_location_and_label_and_marking_locations = [
#            ['40.6000_-73.9500', 'new_york', ['-23.2000_-46.4000', '-22.4750_-42.7750', '51.4750_0.0000', '33.3500_-118.1750', '29.7250_-97.1500', '30.4500_-95.7000']],
            ['29.7250_-97.1500', 'austin', ['-23.2000_-46.4000', '-22.4750_-42.7750', '51.4750_0.0000', '33.3500_-118.1750', '39.1500_-83.3750', '30.4500_-95.7000', '40.6000_-73.9500']],
#            ['30.4500_-95.7000', 'college_station', ['-23.2000_-46.4000', '-22.4750_-42.7750', '51.4750_0.0000', '33.3500_-118.1750', '29.7250_-97.1500', '30.4500_-95.7000', '40.6000_-73.9500']],
        ]
        tuo_location_and_tuo_neighbor_location_and_influence_score = \
            Experiments.load_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(model_id)
        for input_location, label, marking_locations in tuo_input_location_and_label_and_marking_locations:
            for location, tuo_neighbor_location_and_influence_score in \
                    tuo_location_and_tuo_neighbor_location_and_influence_score:
                if input_location==location:
                    InfluenceAnalysis._plot_scores(tuo_neighbor_location_and_influence_score,
                                                   marking_locations, no_of_bins_for_influence_score)
                    plt.xlim(-1, 1)
                    plt.ylim(ymin=0.0)
                    # plt.show() before savefig would save a blank figure;
                    # commented out, matching the other plotting functions.
#                    plt.show()
                    savefig(output_file_format%(label, model_id))
                    break
def hashtag_locations_distribution_loglog():
    ltuo_no_of_locations_and_count = []
    for data in FileIO.iterateJsonFromFile(f_hashtag_and_location_distribution, remove_params_dict=True):
        if data[0]=='location':
            ltuo_no_of_locations_and_count.append(data[1:])
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    no_of_locations, counts = zip(*ltuo_no_of_locations_and_count)
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9, left=0.17)
    ax = plt.subplot(111)
    ax.set_xscale('log')
    ax.set_yscale('log')
    plt.scatter(no_of_locations, counts, c='k')
    plt.xlabel('No. of locations')
    plt.ylabel('No. of hashtags')
    # Fixed: 1/10 is integer division (0) in Python 2; 0.1 was intended.
    plt.xlim(xmin=1/10.)
    plt.ylim(ymin=1/10.)
    plt.grid(True)
#    plt.show()
    savefig(output_file)
def plot_location_plots_with_zones(ltuo_model_id_and_hashtag_tag, no_of_bins_for_influence_score=100):
    output_file_format = fld_results+'/%s_%s.png'
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        no_of_zones, ltuo_location_and_influence_score_and_zone_id = \
            Experiments.get_location_with_zone_ids(model_id, hashtag_tag)
        locations, influence_scores, zone_ids = zip(*ltuo_location_and_influence_score_and_zone_id)
#        print len(locations)
#        print [zone_id for _, _, zone_id in sorted(zip(locations, influence_scores, zone_ids), key=itemgetter(1))]
#        exit()
        # Plot influence plot
        ltuo_location_and_global_influence_score = zip(locations, influence_scores)
        max_y_tick = InfluenceAnalysis._plot_scores(ltuo_location_and_global_influence_score, [],
                                                    no_of_bins_for_influence_score, smooth=True)
        # Plot zones
        ltuo_influence_score_and_zone_id = zip(influence_scores, zone_ids)
        ltuo_zone_id_and_influence_scores = \
            [(zone_id, zip(*ito_tuo_influence_score_and_zone_id)[0])
             for zone_id, ito_tuo_influence_score_and_zone_id in groupby(
                 sorted(ltuo_influence_score_and_zone_id, key=itemgetter(1)),
                 key=itemgetter(1)
             )]
        ltuo_zone_id_and_tuo_min_influence_score_and_max_influence_score = \
            [(zone_id, (min(influence_scores), max(influence_scores)))
             for zone_id, influence_scores in ltuo_zone_id_and_influence_scores]
        ltuo_zone_id_and_tuo_box_start_and_box_width = \
            [(zone_id, (min_influence_score, abs(min_influence_score-max_influence_score)))
             for zone_id, (min_influence_score, max_influence_score) in
                 ltuo_zone_id_and_tuo_min_influence_score_and_max_influence_score]
        zone_ids, ltuo_box_start_and_box_width = zip(*ltuo_zone_id_and_tuo_box_start_and_box_width)
        zone_colors = [GeneralMethods.getRandomColor() for zone_id in zone_ids]
        plt.broken_barh(ltuo_box_start_and_box_width, (0, max_y_tick),
                        facecolors=zone_colors, alpha=0.25, lw=0)
#        temp_ltuo_box_start_and_box_width = []
#        for box_start, box_width in ltuo_box_start_and_box_width:
#            if box_width!=0: temp_ltuo_box_start_and_box_width.append((box_start, box_width))
#            else: temp_ltuo_box_start_and_box_width.append((box_start, 0.0001))
#        zero_size_cluster_ltuo_box_start_and_box_width = []
#        for box_start, box_width in ltuo_box_start_and_box_width:
#            if box_width==0: zero_size_cluster_ltuo_box_start_and_box_width.append((box_start, 0.0001))
#        plt.broken_barh(zero_size_cluster_ltuo_box_start_and_box_width, (0, max_y_tick), facecolors='r', alpha=0.25, lw=0)
#        plt.xlim(xmin=-0.0025, xmax=0.0025)
        output_file = output_file_format%(GeneralMethods.get_method_id(), model_id, hashtag_tag)
        savefig(output_file)
def global_influence_plots(ltuo_model_id_and_hashtag_tag, no_of_bins_for_influence_score=100):
    marking_locations = [
        '18.8500_-98.6000',
#        '2.9000_101.5000',
        '51.4750_0.0000',
        '33.3500_-118.1750',
#        '-23.2000_-46.4000',
        '-22.4750_-42.7750',
        '39.1500_-83.3750',
        '40.6000_-73.9500',
        '29.7250_-97.1500',
        '30.4500_-95.7000'
    ]
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        output_file = fld_results%(GeneralMethods.get_method_id()) + '%s_%s.png'%(model_id, hashtag_tag)
        tuo_location_and_global_influence_score = \
            Experiments.load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag)
        InfluenceAnalysis._plot_scores(tuo_location_and_global_influence_score, marking_locations,
                                       no_of_bins_for_influence_score, smooth=True)
        plt.ylim(ymin=0.0)
#        plt.show()
        savefig(output_file)
def utm_ids_on_map():
    ''' Plots UTM ids on a world map. The color indicates log(total_hashtag_count). '''
    output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'.png'
    ltuo_point_and_total_hashtag_count = []
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, remove_params_dict=True):
        point = UTMConverter.getLatLongUTMIdInLatLongForm(utm_object['utm_id'])
        total_hashtag_count = log(utm_object['total_hashtag_count'])
        ltuo_point_and_total_hashtag_count.append((point, total_hashtag_count))
    points, total_hashtag_counts = zip(*sorted(ltuo_point_and_total_hashtag_count, key=itemgetter(1)))
    plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c=total_hashtag_counts,
                         cmap=matplotlib.cm.cool, lw=0, alpha=1.)
    savefig(output_file)
def plot_local_influencers(ltuo_model_id_and_hashtag_tag):
    tuples_of_boundary_and_boundary_label = [
        ([[24.527135,-127.792969], [49.61071,-59.765625]], 'USA', GeneralMethods.getRandomColor()),
        ([[10.107706,-118.660469], [26.40009,-93.699531]], 'Mexico', GeneralMethods.getRandomColor()),
        ([[-16.6695,88.409841], [30.115057,119.698904]], 'SE-Asia', GeneralMethods.getRandomColor()),
        ([[-29.565473,-58.191719], [7.327985,-30.418282]], 'Brazil', GeneralMethods.getRandomColor()),
    ]
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        print model_id, hashtag_tag
        tuples_of_location_and_color = []
        for boundary, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label:
            tuo_location_and_influence_scores = \
                Experiments.load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag, boundary)
            tuo_location_and_influence_scores = \
                sorted(tuo_location_and_influence_scores, key=itemgetter(1))[:10]
            locations = zip(*tuo_location_and_influence_scores)[0]
            for location in locations:
                tuples_of_location_and_color.append(
                    [getLocationFromLid(location.replace('_', ' ')), boundary_color])
        locations, colors = zip(*tuples_of_location_and_color)
        plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors, lw=0, alpha=1.)
        for _, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label:
            plt.scatter([0], [0], label=boundary_label, c=boundary_color, lw=0)
        plt.legend(loc=3, ncol=4, mode="expand")
#        plt.show()
        savefig(fld_results%(GeneralMethods.get_method_id())+'%s_%s.png'%(model_id, hashtag_tag))
def plot_locations_influence_on_world_map(ltuo_model_id_and_hashtag_tag, noOfInfluencers=10,
                                          percentage_of_locations=0.15):
    input_locations = [
        ('40.6000_-73.9500', 'new_york'),
        ('33.3500_-118.1750', 'los_angeles'),
        ('29.7250_-97.1500', 'austin'),
        ('30.4500_-95.7000', 'college_station'),
        ('-22.4750_-42.7750', 'rio'),
        ('51.4750_0.0000', 'london'),
        ('-23.2000_-46.4000', 'sao_paulo')
    ]
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        tuo_location_and_tuo_neighbor_location_and_locations_influence_score = \
            Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(
                model_id, hashtag_tag, noOfInfluencers=None,
                influence_type=InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE)
        for input_location, label in input_locations:
            for location, tuo_neighbor_location_and_locations_influence_score in \
                    tuo_location_and_tuo_neighbor_location_and_locations_influence_score:
                if input_location==location:
                    input_location = getLocationFromLid(input_location.replace('_', ' '))
                    output_file = fld_results%GeneralMethods.get_method_id() + \
                        '/%s_%s/%s.png'%(model_id, hashtag_tag, label)
                    number_of_outgoing_influences = \
                        int(len(tuo_neighbor_location_and_locations_influence_score)*percentage_of_locations)
                    if number_of_outgoing_influences==0:
                        number_of_outgoing_influences = len(tuo_neighbor_location_and_locations_influence_score)
                    locations = zip(*tuo_neighbor_location_and_locations_influence_score)[0]\
                        [:number_of_outgoing_influences]
                    locations = [getLocationFromLid(location.replace('_', ' ')) for location in locations]
#                    locations = filter(lambda location: isWithinBoundingBox(location, PARTIAL_WORLD_BOUNDARY), locations)
                    if locations:
                        _, m = plotPointsOnWorldMap(locations, resolution='c', blueMarble=False,
                                                    bkcolor='#000000', c='#FF00FF',
                                                    returnBaseMapObject=True, lw=0)
#                        _, m = plotPointsOnWorldMap(locations, resolution='c', blueMarble=False, bkcolor='#CFCFCF', c='#FF00FF', returnBaseMapObject=True, lw=0)
                        for location in locations:
#                            if isWithinBoundingBox(location, PARTIAL_WORLD_BOUNDARY):
                            m.drawgreatcircle(location[1], location[0],
                                              input_location[1], input_location[0],
                                              color='#FAA31B', lw=1., alpha=0.5)
#                        plotPointsOnWorldMap([input_location], blueMarble=False, bkcolor='#CFCFCF', c='#003CFF', s=40, lw=0)
                        plotPointsOnWorldMap([input_location], resolution='c', blueMarble=False,
                                             bkcolor='#000000', c='#003CFF', s=40, lw=0)
#                        plotPointsOnWorldMap([input_location], resolution='c', blueMarble=False, bkcolor='#CFCFCF', c='#003CFF', s=40, lw=0)
                        FileIO.createDirectoryForFile(output_file)
                        print output_file
                        savefig(output_file)
                        plt.clf()
                    else:
                        GeneralMethods.runCommand('rm -rf %s'%output_file)
                    break
def fraction_of_occurrences_vs_rank_of_location():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    ltuo_location_and_occurrence_count = []
    for location_object in \
            FileIO.iterateJsonFromFile(f_dense_hashtag_distribution_in_locations, remove_params_dict=True):
        ltuo_location_and_occurrence_count.append([
            location_object['location'],
            location_object['occurrences_count']
        ])
#    ltuo_location_and_occurrence_count.sort(key=itemgetter(1))
#    for location, occurrence_count in ltuo_location_and_occurrence_count:
#        print location, occurrence_count
#    exit()
    total_occurrences = sum(zip(*ltuo_location_and_occurrence_count)[1]) + 0.0
    ltuo_lid_and_r_occurrence_count = sorted(ltuo_location_and_occurrence_count,
                                             key=itemgetter(1), reverse=True)
    y_fraction_of_occurrences = [r_occurrence_count/total_occurrences
                                 for _, r_occurrence_count in ltuo_lid_and_r_occurrence_count]
#    total_locations = len(y_fraction_of_occurrences)+0.
#    x_percentage_of_locations = [x/total_locations for x in range(1,len(y_fraction_of_occurrences)+1)]
    x_percentage_of_locations = range(1, len(y_fraction_of_occurrences)+1)
    plt.figure(num=None, figsize=(6,3))
    plt.subplots_adjust(bottom=0.2, top=0.9)
    plt.semilogy(x_percentage_of_locations, y_fraction_of_occurrences, lw=0, marker='o', c='k')
    plt.ylabel('Fraction of occurrences')
    plt.xlabel('Locations ordered by their ranks')
    plt.grid(True)
    # Inset with the same curve, zoomed to the top-ranked locations.
    a = plt.axes([.55, .5, .3, .3])
#    plt.plot(range(10))
    plt.semilogy(x_percentage_of_locations, y_fraction_of_occurrences, lw=0, marker='o', c='k')
#    plt.title('Probability')
    plt.grid(True)
    yticks = plt.yticks()
    plt.yticks([yticks[0][-1], yticks[0][0]])
#    plt.ylim(ymin=0.000001, ymax=0.15)
#    plt.ylim(ymin=-0.01, ymax=0.04)
    plt.xlim(xmin=-4, xmax=200)
    plt.setp(a)
#    plt.show()
    savefig(output_file)
def example_for_caverlee():
#    valid_locations = ['18T_585E_4512N', '18T_587E_4514N']
    mf_lid_to_location = dict([
        ('18T_585E_4512N', 'Times Square'),
        ('18T_587E_4514N', 'Central Park'),
        ('18T_584E_4511N', 'Penn Station'),
        ('18T_585E_4511N', 'Empire State Building'),
    ])
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    subplot_num = 221
#    plt.figure(num=None, figsize=(6,3))
    for data in FileIO.iterateJsonFromFile(f_example_for_caverlee, remove_params_dict=True):
        location = data['location']
        if location in mf_lid_to_location:
            td = timedelta(hours=-5)  # Presumably shifts UTC to New York local time.
            ltuo_occ_time_and_count = data['ltuo_occ_time_and_count']
            ltuo_occ_time_and_count.sort(key=itemgetter(0))
            occ_times, counts = zip(*ltuo_occ_time_and_count)
            occ_times = map(datetime.fromtimestamp, occ_times)
            occ_times = map(lambda d: d+td, occ_times)
            occ_hours = map(lambda d: d.hour, occ_times)
            ltuo_occ_hour_and_count = zip(occ_hours, counts)
            ltuo_occ_hour_and_count = [(h, sum(zip(*h_c)[1]))
                                       for h, h_c in GeneralMethods.group_items_by(
                                           ltuo_occ_hour_and_count, key=itemgetter(0))]
            occ_hours, counts = zip(*ltuo_occ_hour_and_count)
            total_counts = sum(counts)+0.0
            counts = map(lambda c: c/total_counts, counts)
            plt.subplot(subplot_num)
#            plt.subplots_adjust(bottom=0.2, top=0.9)
            subplot_num += 1
            plt.plot(occ_hours, counts, color='#EA00FF', lw=1)
            plt.fill_between(occ_hours, counts, color='#EA00FF', alpha=0.25)
#            plt.ylabel('% of tweets')
            plt.xlabel('Time of day')
            plt.xlim(xmax=23)
            plt.ylim(ymax=0.09)
            plot_anchored_text(mf_lid_to_location[location], loc=2)
            plt.grid(True)
#            savefig(output_file_format%mf_lid_to_location[location].replace(' ', '_'))
    # Save the combined 2x2 panel once all locations have been drawn.
    savefig(output_file_format%'ny_locations')
def plot_correlation_between_influence_similarity_and_jaccard_similarity(model_ids):
    for model_id in model_ids:
        mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities = {}
        for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                enumerate(FileIO.iterateJsonFromFile(
                    tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
            print line_count
            for neighbor_location, mf_influence_type_to_similarity in \
                    tuo_neighbor_location_and_mf_influence_type_and_similarity:
                jaccard_similarity = round(mf_influence_type_to_similarity[JACCARD_SIMILARITY], 1)
                for influence_type in \
                        [InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE,
                         InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]:
                    if influence_type not in \
                            mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities:
                        mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities[influence_type] = \
                            defaultdict(list)
                    mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities[influence_type]\
                        [jaccard_similarity].append(mf_influence_type_to_similarity[influence_type])
        subplot_id = 211
        for influence_type, mf_jaccard_similarity_to_influence_similarities in \
                mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities.iteritems():
            plt.subplot(subplot_id)
            x_jaccard_similarities, y_influence_similarities = [], []
            for jaccard_similarity, influence_similarities in \
                    sorted(mf_jaccard_similarity_to_influence_similarities.iteritems(), key=itemgetter(0)):
                influence_similarities = filter_outliers(influence_similarities)
                if len(influence_similarities) > 10:
                    x_jaccard_similarities.append(jaccard_similarity)
                    y_influence_similarities.append(np.mean(influence_similarities))
            rho, p_value = pearsonr(x_jaccard_similarities, y_influence_similarities)
            # Report the correlation; the original computed rho but never used it.
            print influence_type, 'rho: %0.3f, p-value: %0.4f'%(rho, p_value)
            plt.scatter(x_jaccard_similarities, y_influence_similarities,
                        c=InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'],
                        lw=0, s=40)
            plt.plot(x_jaccard_similarities, y_influence_similarities,
                     c=InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'], lw=2)
            if influence_type==InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE:
                plt.ylabel('Influencing locations similarity', fontsize=13)
            else:
                plt.ylabel('Influenced locations similarity', fontsize=13)
            subplot_id += 1
        plt.xlabel('Jaccard similarity', fontsize=13)
        savefig('images/%s.png'%GeneralMethods.get_method_id())
def locations_at_top_and_bottom(model_ids, no_of_locations=5):
    for model_id in model_ids:
        output_file_format = analysis_folder+'%s/'%(GeneralMethods.get_method_id())+'%s/%s.json'
        input_locations = [
#            ('40.6000_-73.9500', 'new_york'),
            ('30.4500_-95.7000', 'college_station'),
        ]
        tuo_location_and_tuo_neighbor_location_and_influence_score = \
            Experiments.load_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(model_id)
        for input_location, label in input_locations:
            for location, tuo_neighbor_location_and_influence_score in \
                    tuo_location_and_tuo_neighbor_location_and_influence_score:
                if input_location==location:
                    output_file = output_file_format%(input_location, model_id)
                    GeneralMethods.runCommand('rm -rf %s'%output_file)
                    FileIO.createDirectoryForFile(output_file)
                    FileIO.writeToFileAsJson("Bottom:", output_file)
                    for neighbor_location_and_influence_score in \
                            tuo_neighbor_location_and_influence_score[:no_of_locations]:
                        FileIO.writeToFileAsJson(neighbor_location_and_influence_score+[''], output_file)
                    FileIO.writeToFileAsJson("Top:", output_file)
                    for neighbor_location_and_influence_score in \
                            reversed(tuo_neighbor_location_and_influence_score[-no_of_locations:]):
                        FileIO.writeToFileAsJson(neighbor_location_and_influence_score+[''], output_file)
def utm_object_analysis():
    ltuo_utm_id_and_num_of_neighbors_and_mean_common_h_count = []
    output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'.df'
    so_valid_utm_ids = set()
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
        so_valid_utm_ids.add(utm_object['utm_id'])
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
        so_valid_nei_utm_ids = \
            set(utm_object['mf_nei_utm_id_to_common_h_count']).intersection(so_valid_utm_ids)
        mean_num_of_common_h_count = np.mean([utm_object['mf_nei_utm_id_to_common_h_count'][nei_utm_id]
                                              for nei_utm_id in so_valid_nei_utm_ids])
        ltuo_utm_id_and_num_of_neighbors_and_mean_common_h_count.append(
            [utm_object['utm_id'], len(so_valid_nei_utm_ids), mean_num_of_common_h_count])
    utm_ids, num_of_neighbors, mean_common_h_count = \
        zip(*ltuo_utm_id_and_num_of_neighbors_and_mean_common_h_count)
    od = rlc.OrdDict([
        ('utm_ids', robjects.StrVector(utm_ids)),
        ('num_of_neighbors', robjects.FloatVector(num_of_neighbors)),
        ('mean_common_h_count', robjects.FloatVector(mean_common_h_count))
    ])
    df = robjects.DataFrame(od)
    FileIO.createDirectoryForFile(output_file)
    print 'Saving df to: ', output_file
    df.to_csvfile(output_file)
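# The data frame saved above is plain CSV (rpy2's DataFrame.to_csvfile
# presumably wraps R's write.csv), so it can be loaded back via R's read.csv.
# A hedged usage sketch; the helper name is an assumption.
def load_utm_df(path):
    import rpy2.robjects as robjects
    # Restores the utm_ids, num_of_neighbors, mean_common_h_count columns.
    return robjects.r['read.csv'](path)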
def spatial_metrics_vs_occurrence_count():
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    # Bucket width for occurrence counts (spelling fixed from OCCURRANCES).
    ACCURACY_NO_OF_OCCURRENCES = 25
    def plot_graph(ltuo_locality_measure_and_occurrences_count, id):
        mf_normalized_occurrences_count_to_locality_measures = defaultdict(list)
        for locality_measure, occurrences_count in ltuo_locality_measure_and_occurrences_count:
            normalized_occurrence_count = \
                int(occurrences_count/ACCURACY_NO_OF_OCCURRENCES)*ACCURACY_NO_OF_OCCURRENCES + \
                ACCURACY_NO_OF_OCCURRENCES
            mf_normalized_occurrences_count_to_locality_measures[normalized_occurrence_count].append(
                locality_measure)
        x_occurrence_counts, y_locality_measures = [], []
        for k in sorted(mf_normalized_occurrences_count_to_locality_measures):
            if len(mf_normalized_occurrences_count_to_locality_measures[k]) > 10:
                x_occurrence_counts.append(k)
                y_locality_measures.append(
                    np.mean(mf_normalized_occurrences_count_to_locality_measures[k]))
        x_occurrence_counts = [x/1000. for x in x_occurrence_counts]
        plt.figure(num=None, figsize=(4.3,3.0))
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15, wspace=0.)
        plt.scatter(x_occurrence_counts, y_locality_measures, lw=0, marker='o', c='k', s=50)
        plt.xlabel('Hashtag occurrences in thousands')
        plt.ylabel('Mean hashtag %s'%id)
        plt.grid(True)
        savefig(output_file_format%('locality_vs_occurrences_'+id))
#    import matplotlib as mpl
#    mpl.rcParams['text.usetex'] = True
    # 'num_of_occurrenes' (sic) is the key as it is spelled in the data files.
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    ltuo_entropy_and_occurrences_count = map(itemgetter('entropy', 'num_of_occurrenes'), data)
    ltuo_focus_and_occurrences_count = map(itemgetter('focus', 'num_of_occurrenes'), data)
    ltuo_focus_and_occurrences_count = [(s, c) for ((_, s), c) in ltuo_focus_and_occurrences_count]
    ltuo_coverage_and_occurrences_count = map(itemgetter('spread', 'num_of_occurrenes'), data)
    plot_graph(ltuo_entropy_and_occurrences_count, 'entropy')
    plot_graph(ltuo_focus_and_occurrences_count, 'focus')
    plot_graph(ltuo_coverage_and_occurrences_count, 'spread')
def plot_correlation_between_influence_similarity_and_distance(model_ids, distance_accuracy=500):
    def get_larger_lid(lid):
        return getLatticeLid(getLocationFromLid(lid.replace('_', ' ')), 10)
    for model_id in model_ids:
        mf_influence_type_to_tuo_distance_and_similarity = defaultdict(list)
        for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                enumerate(FileIO.iterateJsonFromFile(
                    tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
            print line_count
            for neighbor_location, mf_influence_type_to_similarity in \
                    tuo_neighbor_location_and_mf_influence_type_and_similarity:
                distance = getHaversineDistance(getLocationFromLid(location.replace('_', ' ')),
                                                getLocationFromLid(neighbor_location.replace('_', ' ')))
                distance = int(distance)/distance_accuracy*distance_accuracy + distance_accuracy
                for influence_type, similarity in mf_influence_type_to_similarity.iteritems():
                    mf_influence_type_to_tuo_distance_and_similarity[influence_type].append(
                        [distance, similarity])
        subplot_id = 211
        for influence_type in \
                [InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE,
                 InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]:
            tuo_distance_and_similarity = mf_influence_type_to_tuo_distance_and_similarity[influence_type]
            tuo_distance_and_similarities = \
                [(distance, zip(*ito_tuo_distance_and_similarity)[1])
                 for distance, ito_tuo_distance_and_similarity in groupby(
                     sorted(tuo_distance_and_similarity, key=itemgetter(0)), key=itemgetter(0)
                 )]
            plt.subplot(subplot_id)
            x_distances, y_similarities = [], []
            for distance, similarities in tuo_distance_and_similarities:
#                similarities = filter_outliers(similarities)
                x_distances.append(distance)
                y_similarities.append(np.mean(similarities))
#            x_distances, y_similarities = splineSmooth(x_distances, y_similarities)
            plt.semilogy(x_distances, y_similarities,
                         c=InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'], lw=2,
                         marker=InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['marker'])
            plt.ylabel(InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['label'], fontsize=13)
            subplot_id += 1
        plt.xlabel('Distance (Miles)', fontsize=13)
#        plt.show()
        savefig('images/%s.png'%(GeneralMethods.get_method_id()))
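# getHaversineDistance comes from the project's geo utilities; for reference,
# a standard great-circle implementation in miles. The 3958.76-mile mean
# Earth radius and the helper name are the only assumptions here.
def haversine_distance_sketch(point1, point2):
    from math import radians, sin, cos, asin, sqrt
    lat1, lng1 = map(radians, point1)
    lat2, lng2 = map(radians, point2)
    a = sin((lat2-lat1)/2)**2 + cos(lat1)*cos(lat2)*sin((lng2-lng1)/2)**2
    return 2*3958.76*asin(sqrt(a))  # Earth radius in miles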
def norm_iid_vs_locality_measures():
    # Renamed from norm_iid_vs_locality_measuers (typo in the original name);
    # the 'distace_*' output filenames were also corrected to 'distance_*'.
    TIME_UNIT_IN_SECONDS = 10.*60.
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    ltuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage = \
        [data for data in FileIO.iterateJsonFromFile(f_norm_iid_spatial_metrics, remove_params_dict=True)]
    x_normalized_iids, y_entropies, y_focuses, y_distance_from_overall_entropy, \
        y_distance_from_overall_focus, y_coverages = \
        zip(*sorted([(data[0]*TIME_UNIT_IN_SECONDS/60,
                      data[1][1], data[1][2], data[1][4], data[1][5], data[1][3])
                     for data in
                         ltuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage]))
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9)
    plt.subplot(111)
    plt.xlim(xmin=-20, xmax=200)
#    plt.ylim(ymin=0.5, ymax=1.0)
    plt.plot(x_normalized_iids, y_coverages, lw=1, c='k')
    plt.scatter(x_normalized_iids, y_coverages, lw=0, marker='o', s=50, c='k')
    plt.ylabel('Interval coverage')
    plt.xlabel('Minutes since peak')
    plt.grid(True)
    savefig(output_file_format%'coverage')
    plt.clf()
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9)
    plt.subplot(111)
    plt.xlim(xmin=-20, xmax=120)
    plt.ylim(ymin=0.55, ymax=0.70)
    plt.plot(x_normalized_iids, y_entropies, lw=1, c='k')
    plt.scatter(x_normalized_iids, y_entropies, lw=0, marker='o', s=50, c='k')
    plt.ylabel('Interval entropy')
    plt.xlabel('Minutes since peak')
    plt.grid(True)
    savefig(output_file_format%'entropy')
    plt.clf()
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9)
    plt.subplot(111)
    plt.xlim(xmin=-20, xmax=400)
#    plt.ylim(ymin=1, ymax=3)
    plt.plot(x_normalized_iids, y_distance_from_overall_entropy, lw=1, c='k')
    plt.scatter(x_normalized_iids, y_distance_from_overall_entropy, marker='o', s=50, c='k')
    plt.xlabel('Minutes since peak')
    plt.ylabel('Distance from overall entropy')
    plt.grid(True)
    savefig(output_file_format%'distance_from_overall_entropy')
    plt.clf()
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9)
    plt.subplot(111)
    plt.xlim(xmin=-20, xmax=120)
    plt.ylim(ymin=0.797, ymax=0.84)
    plt.plot(x_normalized_iids, y_focuses, lw=1, c='k')
    plt.scatter(x_normalized_iids, y_focuses, lw=1, marker='o', s=50, c='k')
    plt.xlabel('Minutes since peak')
    plt.ylabel('Interval focus')
    plt.grid(True)
    savefig(output_file_format%'focus')
    plt.clf()
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9)
    plt.subplot(111)
    plt.xlim(xmin=-20, xmax=400)
#    plt.ylim(ymin=-0.43, ymax=-0.19)
    plt.plot(x_normalized_iids, y_distance_from_overall_focus, lw=1, c='k')
    plt.scatter(x_normalized_iids, y_distance_from_overall_focus, marker='o', s=50, c='k')
    plt.xlabel('Minutes since peak')
    plt.ylabel('Distance from overall focus')
    plt.grid(True)
    savefig(output_file_format%'distance_from_overall_focus')
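# A hedged driver sketch: which entry points to run, and in what order, is an
# assumption; uncomment the analyses needed for a given experiment.
if __name__ == '__main__':
    hashtag_locations_distribution_loglog()
    fraction_of_occurrences_vs_rank_of_location()
    spatial_metrics_cdf()
    spatial_metrics_vs_occurrence_count()
    ef_plot()
    peak_stats()
    norm_iid_vs_locality_measures()
#    content_affinity_vs_distance()
#    temporal_affinity_vs_distance()
#    generate_data_for_significant_nei_utm_ids()
#    significant_nei_utm_ids()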