def mapper(self, key, value):
    if False: yield # I'm a generator!
    hashtag_object = cjson.decode(value)
    if 'num_of_occurrences' in hashtag_object and\
            hashtag_object['num_of_occurrences'] >= MIN_HASHTAG_OCCURRENCES_FOR_PROPAGATION_ANALYSIS:
        # Bucket occurrence times into TIME_UNIT_IN_SECONDS-wide units.
        ltuo_bucket_occ_time_and_occ_utm_id =\
            map(
                lambda (t, utm_id): (GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS), utm_id),
                hashtag_object['ltuo_occ_time_and_occ_utm_id']
            )
        # Group bucketed occurrence times by UTM id (groupby requires the list sorted on the key).
        ltuo_bucket_occ_time_and_occ_utm_id.sort(key=itemgetter(1))
        ltuo_utm_id_and_bucket_occ_times =\
            [
                (occ_utm_id, map(itemgetter(0), it_bucket_occ_time_and_occ_utm_id))
                for occ_utm_id, it_bucket_occ_time_and_occ_utm_id in
                    groupby(ltuo_bucket_occ_time_and_occ_utm_id, key=itemgetter(1))
            ]
        # Keep only UTM ids with more than 10 occurrences.
        ltuo_utm_id_and_bucket_occ_times =\
            filter(
                lambda (_, occ_times): len(occ_times) > 10,
                ltuo_utm_id_and_bucket_occ_times
            )
        for _, bucket_occ_times in ltuo_utm_id_and_bucket_occ_times:
            gap_perct = 0.05
            gaps = np.arange(gap_perct, 1 + gap_perct, gap_perct)
            bucket_occ_times = filter_outliers(bucket_occ_times)
            bucket_occ_times_at_gaps = get_items_at_gap(bucket_occ_times, gap_perct)
            start_time = float(bucket_occ_times_at_gaps[0])
            life_time = bucket_occ_times_at_gaps[-1] - start_time
            if life_time > 0:
                # Normalize each sampled occurrence time to a percentage of the hashtag's lifespan.
                norm_num_of_occurrences =\
                    map(lambda t: int(((t - start_time) / life_time) * 100), bucket_occ_times_at_gaps)
                for gap, norm_num_of_occurrence in zip(gaps, norm_num_of_occurrences):
                    self.mf_gap_to_norm_num_of_occurrences['%0.2f' % gap] += norm_num_of_occurrence
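# get_items_at_gap() is a helper defined elsewhere in this repository; the
# definition below is NOT the project's implementation. It is a minimal sketch,
# assuming the helper samples one item from the sorted input at every
# `gap_perct` fraction of the list (5%, 10%, ..., 100%), which is how the
# mappers in this file appear to use it.
def get_items_at_gap(sorted_items, gap_perct):
    num_of_items = len(sorted_items)
    items_at_gaps = []
    for perct in np.arange(gap_perct, 1 + gap_perct, gap_perct):
        # Clamp the index so perct=1.0 maps to the last item and tiny lists stay in range.
        index = min(int(perct * num_of_items), num_of_items) - 1
        items_at_gaps.append(sorted_items[max(index, 0)])
    return items_at_gaps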
def mapper1(self, key, hashtag_object):
    if False: yield
    hashtag = hashtag_object['hashtag']
    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    # Group occurrences by location and keep each location's earliest occurrence time.
    ltuo_location_and_items = GeneralMethods.group_items_by(ltuo_occ_time_and_occ_location, key=itemgetter(1))
    ltuo_location_and_occurrence_time =\
        [(location, min(items, key=itemgetter(0))[0]) for location, items in ltuo_location_and_items]
    # Bucket the earliest occurrence times into TIME_UNIT_IN_SECONDS-wide units.
    ltuo_location_and_occurrence_time = [(
                                            location,
                                            GeneralMethods.approximateEpoch(occurrence_time, TIME_UNIT_IN_SECONDS)
                                         ) for location, occurrence_time in ltuo_location_and_occurrence_time]
    if ltuo_location_and_occurrence_time:
        # Drop locations whose first-occurrence time is an outlier.
        occurrence_times = filter_outliers(zip(*ltuo_location_and_occurrence_time)[1])
        ltuo_location_and_occurrence_time =\
            filter(lambda (l, o): o in occurrence_times, ltuo_location_and_occurrence_time)
        for location, occurrence_time in ltuo_location_and_occurrence_time:
            self.mf_location_to_ltuo_hashtag_and_min_occ_time[location].append([hashtag, occurrence_time])
            # Every other location that adopted this hashtag is a neighbor of the current location.
            for neighbor_location, _ in ltuo_location_and_occurrence_time:
                if location != neighbor_location:
                    self.mf_location_to_neighbor_locations[location].add(neighbor_location)
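# filter_outliers() is used throughout this file but defined elsewhere in the
# repository. The version below is only an assumed sketch (keep values within
# two standard deviations of the mean); the actual cutoff used by the project
# may differ.
def filter_outliers(values, num_of_std_devs=2):
    values = list(values)
    if len(values) < 2: return values
    mean_value, std_value = np.mean(values), np.std(values)
    return [value for value in values
            if abs(value - mean_value) <= num_of_std_devs * std_value]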
def plot_correlation_between_influence_similarity_and_jaccard_similarity(model_ids):
    for model_id in model_ids:
        mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities = {}
        for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                enumerate(FileIO.iterateJsonFromFile(
                    tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file % model_id)):
            print line_count
            for neighbor_location, mf_influence_type_to_similarity in \
                    tuo_neighbor_location_and_mf_influence_type_and_similarity:
                # Bucket neighbor pairs by their Jaccard similarity rounded to one decimal place.
                jaccard_similarity = round(mf_influence_type_to_similarity[JACCARD_SIMILARITY], 1)
                for influence_type in \
                        [InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE,
                         InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]:
                    if influence_type not in mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities:
                        mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities[influence_type] = defaultdict(list)
                    mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities[influence_type][jaccard_similarity]\
                        .append(mf_influence_type_to_similarity[influence_type])
        # One subplot per influence type: 211 for the first, 212 for the second.
        subplot_id = 211
        for influence_type, mf_jaccard_similarity_to_influence_similarities in \
                mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities.iteritems():
            plt.subplot(subplot_id)
            x_jaccard_similarities, y_influence_similarities = [], []
            for jaccard_similarity, influence_similarities in \
                    sorted(mf_jaccard_similarity_to_influence_similarities.iteritems(), key=itemgetter(0)):
                influence_similarities = filter_outliers(influence_similarities)
                # Plot a point only for buckets that still have more than 10 samples after outlier removal.
                if len(influence_similarities) > 10:
                    x_jaccard_similarities.append(jaccard_similarity)
                    y_influence_similarities.append(np.mean(influence_similarities))
            rho, p_value = pearsonr(x_jaccard_similarities, y_influence_similarities)
            plt.scatter(x_jaccard_similarities, y_influence_similarities,
                        c=InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'], lw=0, s=40)
            plt.plot(x_jaccard_similarities, y_influence_similarities,
                     c=InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'], lw=2)
            if influence_type == InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE:
                plt.ylabel('Influencing locations similarity', fontsize=13)
            else:
                plt.ylabel('Influenced locations similarity', fontsize=13)
            subplot_id += 1
        plt.xlabel('Jaccard similarity', fontsize=13)
        savefig('images/%s.png' % GeneralMethods.get_method_id())
def mapper(self, key, line):
    if False: yield # I'm a generator!
    hashtag_object = cjson.decode(line)
    if 'num_of_occurrences' in hashtag_object and\
            hashtag_object['num_of_occurrences'] >= MIN_HASHTAG_OCCURRENCES_FOR_PROPAGATION_ANALYSIS:
        ltuo_occ_time_and_occ_utm_id = hashtag_object['ltuo_occ_time_and_occ_utm_id']
        # Group occurrence times by UTM id (groupby requires the list sorted on the key).
        ltuo_occ_time_and_occ_utm_id.sort(key=itemgetter(1))
        ltuo_occ_utm_id_and_occ_times =\
            [
                (occ_utm_id, map(itemgetter(0), it_occ_time_and_occ_utm_id))
                for occ_utm_id, it_occ_time_and_occ_utm_id in
                    groupby(ltuo_occ_time_and_occ_utm_id, key=itemgetter(1))
            ]
        # Keep only UTM ids with enough occurrences.
        ltuo_occ_utm_id_and_occ_times = filter(
            lambda (_, occ_times): len(occ_times) > MIN_OCCURRENCES_PER_UTM_ID,
            ltuo_occ_utm_id_and_occ_times
        )
        for occ_utm_id, occ_times in ltuo_occ_utm_id_and_occ_times:
            occ_times.sort()
            occ_times = filter_outliers(occ_times)
            lifespan = occ_times[-1] - occ_times[0]
            if lifespan > 0.0:
                # Sample the occurrence times at every GAP_PERCT_FOR_PROPAGATION_ANALYSIS fraction of the lifespan
                # and label each sample with its percentile (e.g. 5, 10, ..., 100).
                occ_times_at_gap_perct = get_items_at_gap(occ_times, GAP_PERCT_FOR_PROPAGATION_ANALYSIS)
                ltuo_perct_and_occ_time = [
                    (int((GAP_PERCT_FOR_PROPAGATION_ANALYSIS * i + GAP_PERCT_FOR_PROPAGATION_ANALYSIS) * 100), j)
                    for i, j in enumerate(occ_times_at_gap_perct)
                ]
                # Accumulate the time difference between every earlier and later percentile pair.
                for perct1, occ_time1 in ltuo_perct_and_occ_time:
                    for perct2, occ_time2 in ltuo_perct_and_occ_time:
                        perct_pair = '%s_%s' % (perct1, perct2)
                        if perct2 > perct1:
                            self.mf_perct_pair_to_time_differences[perct_pair].append(
                                max(occ_time2 - occ_time1, 0.0)
                            )
                        else:
                            self.mf_perct_pair_to_time_differences[perct_pair] = [0.0]
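# The mapper above only accumulates into self.mf_perct_pair_to_time_differences;
# the mrjob step that flushes the dictionary is not part of this excerpt. A
# plausible mapper_final() (an assumption, not the original code) would emit the
# accumulated lists so that reducer() further below receives them grouped by
# perct_pair:
def mapper_final(self):
    for perct_pair, time_differences in self.mf_perct_pair_to_time_differences.iteritems():
        yield perct_pair, time_differences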
def _plot_affinities(type):
    # TIME_UNIT_IN_SECONDS = 60*10
    mf_distance_to_affinity_scores = defaultdict(list)
    # Bucket location pairs into 100km-wide haversine-distance bins.
    for similarity_and_lag_object in\
            FileIO.iterateJsonFromFile(f_dense_hashtags_similarity_and_lag, remove_params_dict=True):
        distance = int(similarity_and_lag_object['haversine_distance'] / 100) * 100 + 100
        mf_distance_to_affinity_scores[distance].append(similarity_and_lag_object[type])
    ltuo_distance_and_num_samples = [(distance, affinity_scores)
                                     for distance, affinity_scores in mf_distance_to_affinity_scores.iteritems()]
    ltuo_distance_and_num_samples.sort(key=itemgetter(0))
    # for distance, num_samples in ltuo_distance_and_num_samples:
    #     print distance, len(num_samples), np.mean(num_samples), np.mean(filter_outliers(num_samples))
    # exit()
    # Keep only bins with more than 100 samples and average their scores after removing outliers.
    ltuo_distance_and_affinity_score = [(distance, np.mean(filter_outliers(affinity_scores)))
                                        for distance, affinity_scores in mf_distance_to_affinity_scores.iteritems()
                                        if len(affinity_scores) > 100]
    x_distances, y_affinity_scores = zip(*sorted(ltuo_distance_and_affinity_score, key=itemgetter(0)))
    if type == 'adoption_lag':
        y_affinity_scores = [y / (60. * 60. * 60) for y in y_affinity_scores]
    plt.figure(num=None, figsize=(4.3, 3))
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    x_distances, y_affinity_scores = splineSmooth(x_distances, y_affinity_scores)
    plt.semilogx(x_distances, y_affinity_scores, c='k', lw=2)
    plt.xlim(xmin=95, xmax=15000)
    plt.grid(True)
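# splineSmooth() is a plotting helper defined elsewhere in the codebase. The
# sketch below is an assumption of its behavior (fit a cubic spline to the
# binned points and evaluate it on a denser x grid), included only to make the
# smoothing step above concrete; the project's actual smoothing may differ.
from scipy.interpolate import UnivariateSpline

def splineSmooth(xs, ys, num_of_points=300):
    xs, ys = np.array(xs), np.array(ys)
    smooth_xs = np.linspace(xs.min(), xs.max(), num_of_points)
    spline = UnivariateSpline(xs, ys, k=3)
    return smooth_xs, spline(smooth_xs)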
def reducer(self, perct_pair, it_time_differences):
    # Flatten the per-mapper lists, drop outliers, and emit the mean time difference for this percentile pair.
    time_differences = list(chain(*it_time_differences))
    time_differences = filter_outliers(time_differences)
    yield perct_pair, {'perct_pair': perct_pair, 'time_differences': np.mean(time_differences)}