def reducer2(self, utm_id, it_utm_id_and_hashtags):
    """Yield neighbors whose hashtag overlap with ``utm_id`` passes two gates.

    The incoming records are split into the record for ``utm_id`` itself
    and the records of neighbors whose id sorts after ``utm_id`` (so each
    unordered pair is handled by only one of its two ids). For every such
    neighbor sharing at least 10% of the combined hashtags, a 0/1 pattern
    (1 per shared hashtag, 0 per non-shared one) is compared against a
    uniformly random 0/1 pattern via ``MonteCarloSimulation``; the result
    is averaged over three runs.

    Args:
        utm_id: Id of the cell this reducer call is keyed on.
        it_utm_id_and_hashtags: Iterable of ``(utm_id, hashtags)`` pairs,
            expected to contain the entry for ``utm_id`` itself along
            with its neighbors.

    Yields:
        ``('', info)`` for every neighbor whose averaged probability is
        at most 0.05, where ``info`` holds ``utm_id``,
        ``neighbor_utm_id``, ``mean_probability`` and
        ``num_common_hashtags``. Nothing is yielded when ``utm_id`` has
        no entry of its own or its hashtag set is empty.
    """
    hashtags = None
    neighbors = []
    for neighbor_utm_id, neighbor_hashtags in it_utm_id_and_hashtags:
        if neighbor_utm_id == utm_id:
            hashtags = set(neighbor_hashtags)
        elif utm_id < neighbor_utm_id:
            neighbors.append((neighbor_utm_id, set(neighbor_hashtags)))

    if not hashtags:
        return

    for neighbor_utm_id, neighbor_hashtags in neighbors:
        # Keep both counts as ints: they are used as sizes below, and
        # range()/list repetition reject floats. Only the ratio is
        # computed in floating point.
        num_common_hashtags = len(hashtags & neighbor_hashtags)
        total_hashtags = len(hashtags | neighbor_hashtags)
        # total_hashtags >= 1 here because ``hashtags`` is non-empty.
        if float(num_common_hashtags) / total_hashtags < 0.10:
            continue

        observed_hashtag_pattern = (
            [1] * num_common_hashtags
            + [0] * (total_hashtags - num_common_hashtags)
        )
        # A fresh random 0/1 pattern is drawn for each of the three runs.
        mean_probability = np.mean([
            MonteCarloSimulation.mean_probability(
                MonteCarloSimulation.probability_of_data_extracted_from_same_sample,
                observed_hashtag_pattern,
                [random.sample([0, 1], 1)[0] for _ in range(total_hashtags)]
            )
            for _ in range(3)
        ])
        if mean_probability <= 0.05:
            yield '', {
                'utm_id': utm_id,
                'neighbor_utm_id': neighbor_utm_id,
                'mean_probability': mean_probability,
                'num_common_hashtags': num_common_hashtags
            }
def reducer_with_monte_carlo_simulation(self, location_pair, it_propagation_statuses):
    """Emit a Monte Carlo probability for ``location_pair`` per threshold.

    All status lists in ``it_propagation_statuses`` are flattened into one
    list. The thresholds in
    ``ImpactOfUsingLocationsToPredict.MIN_COMMON_HASHTAGS`` are then
    visited in order; for each one the number of statuses strictly
    exceeds, the observed statuses are compared against an equally long
    list of statuses drawn at random from ``STATUS_BEFORE`` /
    ``STATUS_AFTER``.

    Iteration stops at the first threshold that is not exceeded
    (presumably the thresholds are sorted ascending -- confirm against
    the constant's definition).

    Yields:
        ``(threshold, info)`` where ``info`` holds ``location_pair``,
        ``mean_probability``, ``len_propagation_statuses`` and, under the
        key ``propagation_statuses``, the mean of the observed statuses.
    """
    propagation_statuses = list(chain.from_iterable(it_propagation_statuses))
    for threshold in ImpactOfUsingLocationsToPredict.MIN_COMMON_HASHTAGS:
        if not len(propagation_statuses) > threshold:
            break
        simulate = MonteCarloSimulation.mean_probability
        same_sample_test = (
            MonteCarloSimulation.probability_of_data_extracted_from_same_sample
        )
        random_statuses = [
            random.sample(
                [
                    ImpactOfUsingLocationsToPredict.STATUS_BEFORE,
                    ImpactOfUsingLocationsToPredict.STATUS_AFTER
                ],
                1
            )[0]
            for _ in range(len(propagation_statuses))
        ]
        mean_probability = simulate(
            same_sample_test, propagation_statuses, random_statuses
        )
        summary = {
            'location_pair': location_pair,
            'mean_probability': mean_probability,
            'len_propagation_statuses': len(propagation_statuses),
            'propagation_statuses': np.mean(propagation_statuses)
        }
        yield threshold, summary