def get_HD_from_UTMLL(u1, u2, radius=earthRadiusKMs):
    """Haversine distance between two '<lat>_<lng>_<suffix>' id strings.

    The trailing suffix of each id is ignored; `radius` defaults to
    `earthRadiusKMs`, so the result is presumably in kilometers -- confirm
    against getHaversineDistance.
    """
    lat_a, lng_a, _ = u1.split('_')
    lat_b, lng_b, _ = u2.split('_')
    point_a = (float(lat_a), float(lng_a))
    point_b = (float(lat_b), float(lng_b))
    return getHaversineDistance(point_a, point_b, radius)
def getHDfromUTMLL(u1, u2, radius):
    """Haversine distance between two '<lat>_<lng>_<suffix>' id strings.

    Same contract as get_HD_from_UTMLL but with an explicit, required
    sphere radius.
    """
    first_lat, first_lng, _ = u1.split('_')
    second_lat, second_lng, _ = u2.split('_')
    return getHaversineDistance((float(first_lat), float(first_lng)),
                                (float(second_lat), float(second_lng)),
                                radius)
def map_line_to_user(self, key, line):
    """Mapper: yield (user, [score, creator[3], creator[4]]) per creator.

    The score decays with the haversine distance between the creator's
    location and the query point: s = (dmin / (dist + dmin)) ** 1.01.
    """
    if False:
        yield  # keeps this a generator even if the loop body never runs
    query_point = [self.query_lat, self.query_lng]
    for user, creator in ReadFile.read_json_yield_user(line):
        distance = getHaversineDistance([creator[1], creator[2]], query_point)
        score = (self.dmin / (distance + self.dmin)) ** 1.01
        yield user, [score, creator[3], creator[4]]
def addLocationToLocationDistanceToDB():
    # Store the haversine distance for every edge of the location graph in
    # locationToLocationCollection. Edge key 'e' appears to be four
    # space-separated floats "<lat1> <lng1> <lat2> <lng2>" -- TODO confirm
    # against locationGraphIterator's output.
    i = 0
    for data in locationGraphIterator():
        try:
            d = map(float, data['e'].split())  # Python 2: map() returns a list
            d = getHaversineDistance(d[0:2], d[2:])
            locationToLocationCollection.insert({'_id': data['e'], 'u': data['w'], 'd': d})
        except Exception as e:
            # NOTE(review): `i` counts failed records only; the broad except
            # hides the actual error type (e is never printed).
            print i, 'Exception while processing:', data; i+=1
def read_json_yield_uid1(line):
    """Yield (user_id, [tag, distance]) for each distinct tag on a JSON line.

    `distance` is the haversine distance between the list creator's
    coordinates and the user's coordinates, both taken from the record.
    """
    record = cjson.decode(line)
    uid = record['user_id']
    tags = record['tag']
    creator_point = [record['list_creator_lat'], record['list_creator_lng']]
    user_point = [record['user_lat'], record['user_lng']]
    separation = getHaversineDistance(creator_point, user_point)
    for tag in set(tags):
        yield uid, [tag, separation]
def read_json_yield_uid1(line):
    # NOTE(review): duplicate definition of read_json_yield_uid1 (this chunk
    # appears to concatenate snippets from several files).
    # Emits (user_id, [tag, creator-to-user haversine distance]) once per
    # unique tag in the decoded JSON record.
    record = cjson.decode(line)
    user_id = record['user_id']
    tag_list = record['tag']
    creator_lat = record['list_creator_lat']
    creator_lng = record['list_creator_lng']
    member_lat = record['user_lat']
    member_lng = record['user_lng']
    d = getHaversineDistance([creator_lat, creator_lng], [member_lat, member_lng])
    for t in set(tag_list):
        yield user_id, [t, d]
def read_json_yield_uid2(line):
    """Yield (user_id, [tag, buffered distance]) per distinct tag.

    Distance is measured from the module-level `target_loc` to the user's
    location; the first 10 units are treated as zero (a "free" radius) and
    anything beyond is reduced by that buffer.
    """
    record = cjson.decode(line)
    uid = record['user_id']
    tags = record['tag']
    # Creator coordinates are read but never used; kept so a record missing
    # these keys still raises KeyError exactly as before.
    creator_lat = record['list_creator_lat']
    creator_lng = record['list_creator_lng']
    user_point = [record['user_lat'], record['user_lng']]
    distance = getHaversineDistance(target_loc, user_point)
    distance = max(distance - 10, 0)  # clamp: <=10 becomes 0, else subtract 10
    for tag in set(tags):
        yield uid, [tag, distance]
def read_json_yield_uid2(line):
    # NOTE(review): duplicate definition of read_json_yield_uid2 (compact
    # copy from another file in this concatenated chunk).
    # Yields (user_id, [tag, distance-beyond-10]) where distance is measured
    # from the module-level global `target_loc` to the user's coordinates.
    record = cjson.decode(line)
    user_id = record['user_id']
    tag_values = record['tag']
    lat1 = record['list_creator_lat']   # read but unused, as in the original
    lng1 = record['list_creator_lng']   # read but unused, as in the original
    u_lat = record['user_lat']
    u_lng = record['user_lng']
    dist = getHaversineDistance(target_loc, [u_lat, u_lng])
    if dist <= 10:
        dist = 0
    elif dist > 10:
        dist -= 10
    for tag_value in set(tag_values):
        yield user_id, [tag_value, dist]
def getLocalityIndexAtK(occurances, kValue):
    ''' Locality index at k - for a hashtag is the minimum radius that covers k percentage
    of occurrances. A high locality index suggests hashtag was global with a small index
    suggests it was local.
    To find locality index at k, I must find a point that is closest to k percentage of
    occurances. Brute force requires nC2 complexity. Hence, use lattices of bigger size technique.

    Returns (d, lattice): d is the radius reached when >= kValue fraction of
    occurrences is covered, lattice is the (lat, lng) center chosen by the
    coarse-to-fine search below.
    '''
    def getLatticeThatGivesMinimumLocalityIndexAtK():
        # Coarse-to-fine refinement: repeatedly shrink the lattice accuracy,
        # keeping only the occurrences near the best lattice of each pass.
        occurancesDict = {'occurances': occurances}
        for accuracy in [4, 2, 1, 0.5, ACCURACY]:
            occurancesDict = getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurancesDict['occurances'], accuracy)
        return occurancesDict['sourceLattice']
    def getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurances, accuracy):
        # One refinement pass: bucket occurrences into lattices of the given
        # accuracy, then greedily grow a set of nearby lattices around each
        # candidate source until half the occurrences are covered, keeping the
        # candidate with the smallest accumulated distance.
        occurancesDistributionInHigherLattice, distanceMatrix = defaultdict(list), defaultdict(dict)
        for oc in occurances: occurancesDistributionInHigherLattice[getLatticeLid(oc, accuracy)].append(oc)
        # Candidates ordered by occurrence count, densest lattice first.
        higherLattices = sorted(occurancesDistributionInHigherLattice.iteritems(), key=lambda t: len(t[1]), reverse=True)
        # Symmetric pairwise haversine distances between lattice centers.
        for hl1, hl2 in combinations(occurancesDistributionInHigherLattice, 2):
            distanceMatrix[hl1][hl2] = distanceMatrix[hl2][hl1] = getHaversineDistance(getLocationFromLid(hl1.replace('_', ' ')), getLocationFromLid(hl2.replace('_', ' ')))
        # Replace each row by its neighbors sorted nearest-first.
        for k,v in distanceMatrix.iteritems(): distanceMatrix[k] = sorted(v.iteritems(), key=itemgetter(1))
        occurancesToReturn = []
        # 'distance': () makes the first comparison always true in Python 2
        # (tuples compare greater than numbers), so the first candidate is
        # always explored.
        currentHigherLatticeSet, totalOccurances = {'distance': ()}, float(len(occurances))
        for hl, occs in higherLattices:
            higherLatticeSet = {'distance': 0, 'observedOccurances': len(occs), 'lattices': [hl], 'sourceLattice': hl}
            # Grow by nearest unvisited lattice until this candidate is worse
            # than the incumbent or covers half of all occurrences.
            while currentHigherLatticeSet['distance']>higherLatticeSet['distance'] and higherLatticeSet['observedOccurances']/totalOccurances<0.5:
                (l, d) = distanceMatrix[hl][0]; distanceMatrix[hl]=distanceMatrix[hl][1:]
                higherLatticeSet['distance']+=d
                higherLatticeSet['lattices'].append(l)
                higherLatticeSet['observedOccurances']+=len(occurancesDistributionInHigherLattice[l])
            # NOTE(review): the ==None arm is unreachable (the incumbent is
            # initialized to a dict above); kept as in the original.
            if currentHigherLatticeSet==None or currentHigherLatticeSet['distance']>higherLatticeSet['distance']: currentHigherLatticeSet=higherLatticeSet
        # Pass only the winning set's occurrences on to the next (finer) pass.
        for l in currentHigherLatticeSet['lattices']: occurancesToReturn+=occurancesDistributionInHigherLattice[l]
#        return {'distance': currentHigherLatticeSet['distance'], 'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
        return {'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
    # Histogram of occurrences per finest-accuracy lattice.
    occurancesDistributionInHigherLattice = defaultdict(int)
    for oc in occurances: occurancesDistributionInHigherLattice[getLatticeLid(oc, ACCURACY)]+=1
    totalOccurances, distance, observedOccuraces = float(len(occurances)), 0, 0
    lattice = getLatticeThatGivesMinimumLocalityIndexAtK()
    # Lattices sorted by distance from the chosen center; sweep outward until
    # kValue fraction of occurrences is covered.
    sortedLatticeObjects = sorted([(getLocationFromLid(k.replace('_', ' ')), getHaversineDistance(lattice, getLocationFromLid(k.replace('_', ' '))), v) for k, v in occurancesDistributionInHigherLattice.iteritems()], key=itemgetter(1))
    for l, d, oc in sortedLatticeObjects:
        distance=d; observedOccuraces+=oc
        if observedOccuraces/totalOccurances>=kValue: break
    # NOTE(review): returns loop variable `d` (same as `distance` after the
    # sweep); kept as in the original.
    return (d, lattice)
def plot_correlation_between_influence_similarity_and_distance(model_ids, distance_accuracy=500):
    # For each model, bin location pairs by haversine distance (buckets of
    # `distance_accuracy` miles) and plot mean influence similarity per bucket
    # on a log y-axis, one subplot per influence direction. Saves the figure
    # under images/<method id>.png.
    def get_larger_lid(lid): return getLatticeLid(getLocationFromLid(lid.replace('_', ' ')), 10)
    for model_id in model_ids:
        mf_influence_type_to_tuo_distance_and_similarity = defaultdict(list)
        for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                enumerate(FileIO.iterateJsonFromFile(tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
            print line_count
            for neighbor_location, mf_influence_type_to_similarity in \
                    tuo_neighbor_location_and_mf_influence_type_and_similarity:
                distance = getHaversineDistance(getLocationFromLid(location.replace('_', ' ')), getLocationFromLid(neighbor_location.replace('_', ' ')))
                # Python 2 integer division: snap distance to the upper edge
                # of its distance_accuracy-sized bucket.
                distance = int(distance)/distance_accuracy*distance_accuracy + distance_accuracy
                for influence_type, similarity in mf_influence_type_to_similarity.iteritems():
                    mf_influence_type_to_tuo_distance_and_similarity[influence_type].append([distance, similarity])
        subpot_id = 211
        for influence_type in \
                [InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]:
            tuo_distance_and_similarity = mf_influence_type_to_tuo_distance_and_similarity[influence_type]
            # groupby requires the input sorted by the same key (distance).
            tuo_distance_and_similarities = [(distance, zip(*ito_tuo_distance_and_similarity)[1])
                                             for distance, ito_tuo_distance_and_similarity in
                                             groupby(sorted(tuo_distance_and_similarity, key=itemgetter(0)), key=itemgetter(0))]
            plt.subplot(subpot_id)
            x_distances, y_similarities = [], []
            for distance, similarities in tuo_distance_and_similarities:
#                similarities=filter_outliers(similarities)
                x_distances.append(distance), y_similarities.append(np.mean(similarities))
#            x_distances, y_similarities = splineSmooth(x_distances, y_similarities)
            plt.semilogy(x_distances, y_similarities,
                         c = InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'],
                         lw=2,
                         marker = InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['marker'])
            plt.ylabel(InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['label'], fontsize=13)
            subpot_id+=1
        plt.xlabel('Distance (Miles)', fontsize=13)
#        plt.show()
        savefig('images/%s.png'%(GeneralMethods.get_method_id()))
def getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurances, accuracy):
    # Standalone copy of the helper nested inside getLocalityIndexAtK: bucket
    # occurrences into lattices of the given accuracy, then greedily grow a
    # neighborhood around each candidate source lattice until half of all
    # occurrences is covered, keeping the candidate with the smallest
    # accumulated center-to-center distance.
    occurancesDistributionInHigherLattice, distanceMatrix = defaultdict(list), defaultdict(dict)
    for oc in occurances: occurancesDistributionInHigherLattice[getLatticeLid(oc, accuracy)].append(oc)
    # Candidate lattices, densest first.
    higherLattices = sorted(occurancesDistributionInHigherLattice.iteritems(), key=lambda t: len(t[1]), reverse=True)
    # Symmetric pairwise haversine distances between lattice centers.
    for hl1, hl2 in combinations(occurancesDistributionInHigherLattice, 2):
        distanceMatrix[hl1][hl2] = distanceMatrix[hl2][hl1] = getHaversineDistance(getLocationFromLid(hl1.replace('_', ' ')), getLocationFromLid(hl2.replace('_', ' ')))
    # Each row becomes a nearest-first list of (lattice, distance) pairs.
    for k,v in distanceMatrix.iteritems(): distanceMatrix[k] = sorted(v.iteritems(), key=itemgetter(1))
    occurancesToReturn = []
    # 'distance': () -> in Python 2 a tuple compares greater than any number,
    # so the first candidate always enters the while loop.
    currentHigherLatticeSet, totalOccurances = {'distance': ()}, float(len(occurances))
    for hl, occs in higherLattices:
        higherLatticeSet = {'distance': 0, 'observedOccurances': len(occs), 'lattices': [hl], 'sourceLattice': hl}
        # Grow by nearest unvisited lattice while this candidate beats the
        # incumbent and covers less than half of the occurrences.
        while currentHigherLatticeSet['distance']>higherLatticeSet['distance'] and higherLatticeSet['observedOccurances']/totalOccurances<0.5:
            (l, d) = distanceMatrix[hl][0]; distanceMatrix[hl]=distanceMatrix[hl][1:]
            higherLatticeSet['distance']+=d
            higherLatticeSet['lattices'].append(l)
            higherLatticeSet['observedOccurances']+=len(occurancesDistributionInHigherLattice[l])
        # NOTE(review): the ==None arm is unreachable (incumbent starts as a
        # dict); kept as in the original.
        if currentHigherLatticeSet==None or currentHigherLatticeSet['distance']>higherLatticeSet['distance']: currentHigherLatticeSet=higherLatticeSet
    # Collect the winning neighborhood's occurrences for the next pass.
    for l in currentHigherLatticeSet['lattices']: occurancesToReturn+=occurancesDistributionInHigherLattice[l]
#    return {'distance': currentHigherLatticeSet['distance'], 'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
    return {'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
def getMeanDistanceFromSource(source, llids):
    # Mean haversine distance from `source` to each point in `llids`.
    return np.mean([getHaversineDistance(source, p) for p in llids])

# NOTE(review): truncated fragment -- the body of the following def lies
# outside the visible chunk; header left untouched.
def getLocalityIndexAtK(occurances, kValue):
def getRadius(locations):
    """Mean distance from the locations' center of mass, excluding outliers.

    Outliers are points whose distance exceeds the upper bound from the
    IRQ-based range returned by getOutliersRangeUsingIRQ.
    """
    center = getCenterOfMass(locations, accuracy=LATTICE_ACCURACY)
    center_distances = [getHaversineDistance(center, point) for point in locations]
    _, distance_cutoff = getOutliersRangeUsingIRQ(center_distances)
    return np.mean([d for d in center_distances if d <= distance_cutoff])
# NOTE(review): the next two statements are the tail of a function whose
# 'def' line is outside this chunk; left untouched.
dist=getHaversineDistance(corr1,corr2,radius)
return dist

# Mean Earth radius in miles, so the distances computed below are in miles.
earthRadiusMiles = 3958.761
# Output path for records near the target point (file name suggests Dallas).
outfile='/spare/wei/folk/dallas_tagging'
#outfile1='/spare/wei/folk/dist_greater_than_50_less_than_500_2'
#outfile2='/spare/wei/folk/dist_greater_than_500_less_than_3000_2'
#outfile3='/spare/wei/folk/dist_greater_than_3000_2'
outfile=open(outfile,'w')
#outfile1=open(outfile1,'w')
#outfile2=open(outfile2,'w')
#outfile3=open(outfile3,'w')
# Filter: keep JSON records whose user location is within 20 miles of the
# hard-coded target coordinates (the commented pairs are other cities that
# were swapped in for earlier runs).
for line in open(infile,'r'):
    line = cjson.decode(line)
    lat_u,lng_u=line['user_lat'],line['user_lng']
    lat_c,lng_c = 32.78014,-96.800451
#    lat_c,lng_c=line['list_creator_lat'],line['list_creator_lng']
#    lat_c,lng_c = 40.705631,-73.978003
#    lat_c,lng_c = 37.77493,-122.419416
#    lat_c,lng_c=29.760193,-95.36939
#    lat_c,lng_c = 30.627977,-96.334407
    dist = getHaversineDistance([lat_u,lng_u],[lat_c,lng_c],earthRadiusMiles)
    if dist <= 20:
        outfile.write(cjson.encode(line)+'\n')
#    elif dist>50 and dist<500:
#        outfile1.write(cjson.encode(line)+'\n')
#    elif dist>500 and dist<3000:
#        outfile2.write(cjson.encode(line)+'\n')
#    else:
#        outfile3.write(cjson.encode(line)+'\n')
def _haversine_distance(self, location, neighbor_location):
    """Haversine distance between two UTM lat/long ids.

    Both ids are converted to (lat, long) form via UTMConverter before the
    distance is computed.
    """
    return getHaversineDistance(
        UTMConverter.getLatLongUTMIdInLatLongForm(location),
        UTMConverter.getLatLongUTMIdInLatLongForm(neighbor_location))
def map_line_to_user(self, key, line):
    # NOTE(review): duplicate of the map_line_to_user defined earlier in this
    # concatenated chunk; compact copy from another file.
    # Emits (user, [distance-decayed score, creator[3], creator[4]]).
    if False:
        yield  # generator marker, as in the original
    for user, creator in ReadFile.read_json_yield_user(line):
        separation = getHaversineDistance(
            [creator[1], creator[2]], [self.query_lat, self.query_lng])
        weight = (self.dmin / (separation + self.dmin)) ** 1.01
        yield user, [weight, creator[3], creator[4]]