Example #1
from collections import defaultdict

def collapse_rows(rows, min_locs, dispersion_threshold):
    '''
    Take the rows of a Spark SQL query for a single user and simultaneously extract the estimated
    location and @mention counts.
    Args:
        rows (iterable of pyspark Row) : output of a Spark SQL query
        min_locs (int) : Minimum number of tweets with a location required to infer a location for the user
        dispersion_threshold (int) : A distance threshold on the dispersion of the estimated location for a
            user. Estimated points with dispersion greater than the threshold are considered unable to be
            predicted, given how dispersed the tweet locations are from one another.

    Returns:
        (LocEstimate, defaultdict) : The user's estimated location (None if it could not be inferred)
            and a mapping from mentioned user id to mention count.
    '''
    mention_count = defaultdict(int)
    geo_coords = []
    for row in rows:
        # Aggregate geo locations to calculate the median position
        if row.geo is not None:
            lat,lon = row.geo
            geo_coords.append(LocEstimate(GeoCoord(lat,lon), None, None))
        elif row.place is not None and row.place_type in ['city', 'neighborhood', 'poi']:
            lat,lon = bb_center(row.place)
            geo_coords.append(LocEstimate(GeoCoord(lat,lon), None, None))
        # Count user @mentions
        if len(row.mentions) > 0:
            for mention in row.mentions:
                mention_count[long(mention)] += 1
    if len(geo_coords) >= min_locs:
        loc_estimate = median(haversine, geo_coords)
        if dispersion_threshold is not None and loc_estimate.dispersion > dispersion_threshold:
            loc_estimate = None
    else:
        loc_estimate = None
    return (loc_estimate, mention_count)
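
A minimal usage sketch (not from the source): collapse_rows is presumably applied per user after grouping tweet Rows by user id. The table name, selected columns, and parameter values below are assumptions for illustration; GeoCoord and LocEstimate are taken to be the namedtuples used throughout these examples.

# Hypothetical driver for collapse_rows; assumes a Spark SQL context named
# sqlCtx and a registered table whose rows carry geo, place, place_type and
# mentions fields as used above
rows = sqlCtx.sql('select user.id_str, geo, place, place_type, mentions from tweets')
per_user = rows.map(lambda row: (row.id_str, row)).groupByKey()
estimates = per_user.mapValues(
    lambda user_rows: collapse_rows(user_rows, min_locs=3, dispersion_threshold=50))
# Keep only users whose location could be estimated
located = estimates.filter(lambda (id_str, (loc, mentions)): loc is not None)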
Example #2
    def tokenize(inputRow, fields=set(['text'])):
        """Initial stand in attempt at tokenizing strings
        Params:
          inputRow: a pyspark row
        Output:
        (location, tokens): a tuple of the location of the tweet and a list of tokens in the tweet
        """
        # Allow us to select which fields get pulled for the model
        text = []
        if 'text' in fields:
            text.append(inputRow.text.strip())
        if 'user.location' in fields:
            # Fall back to the nested user.location field when the row does
            # not expose a flattened location attribute
            try:
                text.append(inputRow.location.strip())
            except AttributeError:
                text.append(inputRow.user.location.strip())
        text = ' '.join(text)
        # Get true location
        if inputRow.geo and inputRow.geo.type == 'Point':
            location = inputRow.geo.coordinates
        elif inputRow.place and inputRow.place.bounding_box and inputRow.place.bounding_box.type == 'Polygon' \
                and inputRow.place.place_type in ['city', 'poi', 'neighborhood']:
            location = bb_center(inputRow.place.bounding_box.coordinates)
        else:
            location = None

        if 'text' in fields:
            # Clean up URLs in tweet
            updates_to_make = []
            if inputRow.entities and inputRow.entities.urls:
                for url_row in inputRow.entities.urls:
                    updates_to_make.append(
                        (url_row.url, urlparse.urlparse(
                            url_row.expanded_url).netloc.replace('.', '_')))
            if inputRow.extended_entities and inputRow.extended_entities.media:
                for media_row in inputRow.extended_entities.media:
                    updates_to_make.append(
                        (media_row.url,
                         urlparse.urlparse(
                             media_row.expanded_url).netloc.replace('.', '_')))
            for (original, new_string) in updates_to_make:
                text = text.replace(original, new_string)

        # Convert to lowercase and remove @mentions
        tokens = []
        for item in text.lower().split():
            if not item.startswith('@'):
                tokens.append(item)
        return (location, tokens)
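
A minimal usage sketch (not from the source): tokenize is presumably mapped over an RDD of tweet Rows to produce (location, tokens) training pairs. The module-level import and the RDD name below are assumptions; urlparse is the Python 2 module (urllib.parse in Python 3).

import urlparse  # required by the URL clean-up above; Python 2 only

# Hypothetical pipeline: tokenize every tweet and keep only geotagged ones
training_pairs = tweets_rdd.map(tokenize)\
    .filter(lambda (location, tokens): location is not None)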
Example #3
def get_known_locs(sqlCtx,
                   table_name,
                   include_places=True,
                   min_locs=3,
                   num_partitions=30,
                   dispersion_threshold=50):
    '''
    Given a loaded twitter table, this will return all the twitter users with locations. A user's location is
    determined by the median location of all known tweets. A user must have at least min_locs located tweets
    for a location to be estimated.

    Args:
        sqlCtx (Spark SQL Context) : A Spark SQL context
        table_name (string) : Table name that was registered when loading the data
        include_places (bool) : Whether to also use the centers of place bounding boxes for tweets
            without geo coordinates
        min_locs (int) : Minimum number of tweets with a location required to infer a location for the user
        num_partitions (int) : Number of partitions for the resulting RDD
        dispersion_threshold (int) : A distance threshold on the dispersion of the estimated location for a
            user. Estimated points with dispersion greater than the threshold are considered unable to be
            predicted, given how dispersed the tweet locations are from one another.

    Returns:
        locations (RDD of (id_str, LocEstimate)) : Estimated locations of users. This RDD is often used as
            the ground truth of locations
    '''

    geo_coords = sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null' % table_name)\
        .map(lambda row: (row.id_str, row.coordinates))

    if include_places:
        place_coords = sqlCtx.sql(
            "select user.id_str, place.bounding_box.coordinates from %s " % table_name +
            "where geo.coordinates is null and size(place.bounding_box.coordinates) > 0 " +
            "and place.place_type in ('city', 'neighborhood', 'poi')")\
            .map(lambda row: (row.id_str, bb_center(row.coordinates)))
        geo_coords = geo_coords.union(place_coords)

    # Note: tuple-unpacking lambdas are Python 2 syntax (removed in Python 3)
    return geo_coords.groupByKey()\
        .filter(lambda (id_str, coord_list): len(coord_list) >= min_locs)\
        .map(lambda (id_str, coords): (id_str, median(haversine,
            [LocEstimate(GeoCoord(lat, lon), None, None) for lat, lon in coords])))\
        .filter(lambda (id_str, loc): loc.dispersion < dispersion_threshold)\
        .coalesce(num_partitions).cache()
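
A minimal end-to-end sketch (not from the source): how get_known_locs might be called after loading tweets into a registered table. The file path and table name are assumptions; the loading API shown is the Spark 1.x SQLContext style this code targets.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName='known_locs')
sqlCtx = SQLContext(sc)
# Hypothetical input path; any newline-delimited JSON tweet dump would do
tweets = sqlCtx.read.json('hdfs:///data/tweets/*.json.gz')
tweets.registerTempTable('tweets')
known_locs = get_known_locs(sqlCtx, 'tweets', min_locs=3, dispersion_threshold=50)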
Example #4
def get_location_from_tweet(row):
    """
    Extract location from a tweet object. If geo.coordinates not present use center of place.bounding_box.

    Args:
        row (Row): A spark sql row containing a tweet

    Retruns:
        GeoCoord: The location in the tweet
    """
    # Get true location
    if row.geo and row.geo.type == 'Point':
        ll = row.geo.coordinates
        location = GeoCoord(lat=ll[0], lon=ll[1])
    elif row.place and row.place.bounding_box and row.place.bounding_box.type == 'Polygon' \
            and row.place.place_type in ['city', 'poi', 'neighborhood']:
        ll = bb_center(row.place.bounding_box.coordinates)
        location = GeoCoord(lat=ll[0], lon=ll[1])
    else:
        location = None

    return location
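
bb_center is called throughout these examples but never shown. A plausible reconstruction, assuming the Twitter/GeoJSON bounding box layout (a single ring of (lon, lat) pairs) and returning (lat, lon) to match how the callers above unpack it; the project's actual helper may differ.

def bb_center(coordinates):
    """Approximate center of a Twitter place bounding box.

    Assumes coordinates[0] is a ring of [lon, lat] pairs, as in
    place.bounding_box.coordinates, and returns a (lat, lon) tuple.
    This is a reconstruction, not the source implementation.
    """
    ring = coordinates[0]
    lats = [point[1] for point in ring]
    lons = [point[0] for point in ring]
    return (sum(lats) / float(len(lats)), sum(lons) / float(len(lons)))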