from collections import defaultdict
import urlparse  # Python 2 stdlib; this module targets Python 2 / Spark 1.x

# GeoCoord, LocEstimate, bb_center, median, and haversine are project helpers
# assumed to be defined or imported elsewhere in this module.


def collapse_rows(rows, min_locs, dispersion_threshold):
    '''
    Takes the output of a Spark SQL query and simultaneously extracts the
    estimated location and the @mention counts for a user.

    Args:
        rows (iterable of pyspark Row): output of a Spark SQL query
        min_locs (int): minimum number of tweets with a location required
            to infer a location for the user
        dispersion_threshold (int): a distance threshold on the dispersion
            of a user's estimated location. Estimates with dispersion
            greater than the threshold are considered unpredictable because
            the located tweets are too far from one another.

    Returns:
        (LocEstimate, dict): the user's estimated location (None if there
            are too few locations or the estimate is too dispersed) and a
            mapping of mentioned user ids to mention counts
    '''
    mention_count = defaultdict(int)
    geo_coords = []
    for row in rows:
        # Aggregate geo locations to calculate the median position
        if row.geo is not None:
            lat, lon = row.geo
            geo_coords.append(LocEstimate(GeoCoord(lat, lon), None, None))
        elif row.place is not None and row.place_type in ['city', 'neighborhood', 'poi']:
            lat, lon = bb_center(row.place)
            geo_coords.append(LocEstimate(GeoCoord(lat, lon), None, None))
        # Count user @mentions
        if len(row.mentions) > 0:
            for mention in row.mentions:
                mention_count[long(mention)] += 1
    if len(geo_coords) >= min_locs:
        loc_estimate = median(haversine, geo_coords)
        # Discard estimates that are too dispersed to be trusted
        if dispersion_threshold is not None and loc_estimate.dispersion > dispersion_threshold:
            loc_estimate = None
    else:
        loc_estimate = None
    return (loc_estimate, mention_count)
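# A minimal usage sketch for collapse_rows. The `tweets` table name and the
# column aliases in this query are assumptions chosen to match the fields
# collapse_rows reads (geo, place, place_type, mentions); they are not a
# schema this module guarantees.
def _example_collapse_rows(sqlCtx):
    rows = sqlCtx.sql(
        'select user.id_str as id_str, geo.coordinates as geo, '
        'place.bounding_box.coordinates as place, '
        'place.place_type as place_type, '
        'entities.user_mentions.id_str as mentions from tweets')
    # Group each user's tweets, then collapse them into a single
    # (LocEstimate, mention_count) pair per user
    return rows.map(lambda row: (row.id_str, row))\
        .groupByKey()\
        .mapValues(lambda user_rows: collapse_rows(
            user_rows, min_locs=3, dispersion_threshold=50))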
def tokenize(inputRow, fields=set(['text'])):
    """
    Initial stand-in attempt at tokenizing strings.

    Args:
        inputRow: a pyspark Row
        fields: set of field names to pull into the model text

    Returns:
        (location, tokens): a tuple of the location of the tweet and a list
            of tokens in the tweet
    """
    # Select which fields get pulled for the model
    text = []
    if 'text' in fields:
        text.append(inputRow.text.strip())
    if 'user.location' in fields:
        try:
            text.append(inputRow.location.strip())
        except AttributeError:
            text.append(inputRow.user.location.strip())
    text = ' '.join(text)
    # Get the true location
    if inputRow.geo and inputRow.geo.type == 'Point':
        location = inputRow.geo.coordinates
    elif inputRow.place and inputRow.place.bounding_box \
            and inputRow.place.bounding_box.type == 'Polygon' \
            and inputRow.place.place_type in ['city', 'poi', 'neighborhood']:
        location = bb_center(inputRow.place.bounding_box.coordinates)
    else:
        location = None
    if 'text' in fields:
        # Replace URLs in the tweet with the domain of their expanded form
        updates_to_make = []
        if inputRow.entities and inputRow.entities.urls:
            for url_row in inputRow.entities.urls:
                updates_to_make.append(
                    (url_row.url,
                     urlparse.urlparse(url_row.expanded_url).netloc.replace('.', '_')))
        if inputRow.extended_entities and inputRow.extended_entities.media:
            for media_row in inputRow.extended_entities.media:
                updates_to_make.append(
                    (media_row.url,
                     urlparse.urlparse(media_row.expanded_url).netloc.replace('.', '_')))
        for (original, new_string) in updates_to_make:
            text = text.replace(original, new_string)
    # Convert to lowercase and remove @mentions
    tokens = []
    for item in text.lower().split():
        if not item.startswith('@'):
            tokens.append(item)
    return (location, tokens)
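# A short sketch of tokenize in a Spark pipeline, assuming `tweets` was
# registered as a table of full tweet rows. Keeping only tweets with a
# recoverable location yields (location, tokens) pairs of the kind a
# text-based geolocation model would train on.
def _example_tokenize(sqlCtx):
    tweets = sqlCtx.table('tweets')
    return tweets.map(lambda row: tokenize(row, fields=set(['text'])))\
        .filter(lambda (location, tokens): location is not None)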
def get_known_locs(sqlCtx, table_name, include_places=True, min_locs=3,
                   num_partitions=30, dispersion_threshold=50):
    '''
    Given a loaded twitter table, returns all twitter users with known
    locations. A user's location is the median location of all of their
    located tweets, and a user must have at least min_locs located tweets
    for a location to be estimated.

    Args:
        sqlCtx (SQLContext): a Spark SQL context
        table_name (string): table name that was registered when loading the data
        include_places (bool): whether to also use the centers of place
            bounding boxes for tweets without geo coordinates
        min_locs (int): minimum number of tweets with a location required
            to infer a location for the user
        num_partitions (int): number of partitions for the resulting RDD
        dispersion_threshold (int): a distance threshold on the dispersion
            of a user's estimated location. Estimates with dispersion
            greater than the threshold are considered unpredictable because
            the located tweets are too far from one another.

    Returns:
        locations (RDD of (id_str, LocEstimate)): found locations of users.
            This RDD is often used as the ground truth of locations.
    '''
    geo_coords = sqlCtx.sql(
        'select user.id_str, geo.coordinates from %s '
        'where geo.coordinates is not null' % table_name)\
        .map(lambda row: (row.id_str, row.coordinates))
    if include_places:
        place_coords = sqlCtx.sql(
            "select user.id_str, place.bounding_box.coordinates from %s " % table_name +
            "where geo.coordinates is null and size(place.bounding_box.coordinates) > 0 " +
            "and place.place_type in ('city', 'neighborhood', 'poi')")\
            .map(lambda row: (row.id_str, bb_center(row.coordinates)))
        geo_coords = geo_coords.union(place_coords)
    return geo_coords.groupByKey()\
        .filter(lambda (id_str, coord_list): len(coord_list) >= min_locs)\
        .map(lambda (id_str, coords):
             (id_str, median(haversine, [LocEstimate(GeoCoord(lat, lon), None, None)
                                         for lat, lon in coords])))\
        .filter(lambda (id_str, loc): loc.dispersion < dispersion_threshold)\
        .coalesce(num_partitions).cache()
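# A usage sketch for get_known_locs; the table name 'tweets' is an assumption
# standing in for whatever name was registered when the data was loaded.
def _example_get_known_locs(sqlCtx):
    known_locs = get_known_locs(sqlCtx, 'tweets', include_places=True,
                                min_locs=3, num_partitions=30,
                                dispersion_threshold=50)
    # Peek at a few (id_str, LocEstimate) ground-truth pairs
    for id_str, loc_estimate in known_locs.take(5):
        print id_str, loc_estimate
    return known_locs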
def get_location_from_tweet(row):
    """
    Extract the location from a tweet object. If geo.coordinates is not
    present, fall back to the center of place.bounding_box.

    Args:
        row (Row): a Spark SQL row containing a tweet

    Returns:
        GeoCoord: the location in the tweet, or None if no location is found
    """
    # Get the true location
    if row.geo and row.geo.type == 'Point':
        ll = row.geo.coordinates
        location = GeoCoord(lat=ll[0], lon=ll[1])
    elif row.place and row.place.bounding_box \
            and row.place.bounding_box.type == 'Polygon' \
            and row.place.place_type in ['city', 'poi', 'neighborhood']:
        ll = bb_center(row.place.bounding_box.coordinates)
        location = GeoCoord(lat=ll[0], lon=ll[1])
    else:
        location = None
    return location
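# A sketch pairing get_location_from_tweet with a registered table to build
# (user id, GeoCoord) pairs, one per located tweet. The `tweets` table name
# is an assumption.
def _example_get_location_from_tweet(sqlCtx):
    tweets = sqlCtx.table('tweets')
    return tweets.map(lambda row: (row.user.id_str, get_location_from_tweet(row)))\
        .filter(lambda (id_str, loc): loc is not None)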