Example #1
def test_extract_location():
    if not secrets.exists("GOOGLE_MAPS_KEY"):
        pytest.skip("Could not find GOOGLE_MAPS_KEY")

    parsed = dict(
        link="https://twitter.com/mda_israel/status/1253010741080326148",
        title='בשעה 19:39 התקבל דיווח במוקד 101 של מד"א במרחב דן על הולכת רגל שככל הנראה נפגעה מאופנוע ברחוב ביאליק ברמת גן. צוותי מד"א מעניקים טיפול ומפנים לבי"ח איכילוב 2 פצועים: אישה כבת 30 במצב קשה, עם חבלה רב מערכתית ורוכב האופנוע, צעיר בן 18 במצב בינוני, עם חבלות בראש ובגפיים.',
        description='בשעה 19:39 התקבל דיווח במוקד 101 של מד"א במרחב דן על הולכת רגל שככל הנראה נפגעה מאופנוע ברחוב ביאליק ברמת גן. צוותי מד"א מעניקים טיפול ומפנים לבי"ח איכילוב 2 פצועים: אישה כבת 30 במצב קשה, עם חבלה רב מערכתית ורוכב האופנוע, צעיר בן 18 במצב בינוני, עם חבלות בראש ובגפיים.',
        source="twitter",
        tweet_id=1253010741080326144,
        author="מגן דוד אדום",
        date=datetime.datetime(2020, 4, 22, 19, 39, 51),
        accident=True,
    )
    expected = NewsFlash(
        **parsed,
        lat=32.0861791,
        lon=34.8098462,
        resolution="רחוב",
        location="רחוב ביאליק ברמת גן",
        road_segment_name=None,
        district_hebrew=None,
        non_urban_intersection_hebrew=None,
        region_hebrew=None,
        road1=None,
        road2=None,
        street1_hebrew="ביאליק",
        street2_hebrew=None,
        yishuv_name="רמת גן",
    )

    actual = NewsFlash(**parsed)
    location_extraction.extract_geo_features(init_db(), actual)
    for k in to_dict(expected):
        assert getattr(actual, k) == getattr(expected, k)
Example #2
def scrape_all():
    """
    main function for newsflash scraping
    """
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    db = init_db()
    scrape_extract_store_rss("ynet", db)
    scrape_extract_store_rss("walla", db)
    scrape_extract_store_twitter("mda_israel", db)
Example #3
def main():
    """
    main function for beginning of the news flash process
    """
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    db = init_db()
    rss_sites.scrape_extract_store("ynet", db)
    rss_sites.scrape_extract_store("walla", db)
    twitter.scrape_extract_store("mda_israel", db)
Example #4
def main(google_maps_key):
    """
    main function for beginning of the news flash process
    :param google_maps_key: google maps key
    """
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    db = init_db()
    rss_sites.scrape_extract_store('ynet', google_maps_key, db)
    rss_sites.scrape_extract_store('walla', google_maps_key, db)
    mda_twitter(google_maps_key)
Example #5
def main(google_maps_key, source=None, news_flash_id=None):
    db = news_flash_db_adapter.init_db()
    if news_flash_id is not None:
        news_flash_data = db.get_all_news_flash_data_for_updates(
            id=news_flash_id)
    elif source is not None:
        news_flash_data = db.get_all_news_flash_data_for_updates(source=source)
    else:
        news_flash_data = db.get_all_news_flash_data_for_updates()
    if len(news_flash_data) > 0:
        update_news_flash(db, google_maps_key, news_flash_data)
    else:
        logging.info('no matching news flash found, source={0}, id={1}'.format(
            source, news_flash_id))
Example #6
def update_all_in_db(source=None, newsflash_id=None):
    """
    main function for newsflash updating.

    Should be executed each time the classification or location-extraction are updated.
    """
    db = init_db()
    if newsflash_id is not None:
        newsflash_items = db.get_newsflash_by_id(newsflash_id)
    elif source is not None:
        newsflash_items = db.select_newsflash_where_source(source)
    else:
        newsflash_items = db.get_all_newsflash()

    for newsflash in newsflash_items:
        classify = news_flash_classifiers[newsflash.source]
        newsflash.organization = classify_organization(newsflash.source)
        newsflash.accident = classify(newsflash.description or newsflash.title)
        if newsflash.accident:
            extract_geo_features(db, newsflash)
    db.commit()
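
A minimal usage sketch for the function above (the id value is hypothetical, and the import of the function from its module is not shown in the original):

# Hypothetical calls: re-run classification and location extraction for a single
# item, for every item from one source, or for the whole newsflash table.
update_all_in_db(newsflash_id=123)   # hypothetical id
update_all_in_db(source="twitter")
update_all_in_db()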
Example #7
def remove_duplicate_news_flash_rows():
    from anyway.parsers import news_flash_db_adapter

    news_flash_db_adapter.init_db().remove_duplicate_rows()
Example #8
def mda_twitter(google_maps_key):
    TWITTER_CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
    TWITTER_CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
    TWITTER_ACCESS_KEY = os.environ.get("TWITTER_ACCESS_KEY")
    TWITTER_ACCESS_SECRET = os.environ.get("TWITTER_ACCESS_SECRET")

    twitter_user = "******"

    db = news_flash_db_adapter.init_db()

    latest_tweet_id = db.get_latest_tweet_id_from_db()

    # check if there are any MDA tweets in the DB
    if latest_tweet_id:
        mda_tweets = get_user_tweets(
            twitter_user,
            latest_tweet_id,
            TWITTER_CONSUMER_KEY,
            TWITTER_CONSUMER_SECRET,
            TWITTER_ACCESS_KEY,
            TWITTER_ACCESS_SECRET,
            google_maps_key,
        )
    else:
        mda_tweets = get_user_tweets(
            twitter_user,
            "no_tweets",
            TWITTER_CONSUMER_KEY,
            TWITTER_CONSUMER_SECRET,
            TWITTER_ACCESS_KEY,
            TWITTER_ACCESS_SECRET,
            google_maps_key,
        )
    if mda_tweets is None:
        return

    mda_tweets = mda_tweets.loc[
        :,
        [
            "tweet_id",
            "title",
            "link",
            "date",
            "author",
            "description",
            "location",
            "lat",
            "lon",
            "resolution",
            "region_hebrew",
            "district_hebrew",
            "yishuv_name",
            "street1_hebrew",
            "street2_hebrew",
            "non_urban_intersection_hebrew",
            "road1",
            "road2",
            "road_segment_name",
            "accident",
            "source",
        ],
    ]

    for row in mda_tweets.itertuples(index=False):
        (
            tweet_id,
            title,
            link,
            date,
            author,
            description,
            location,
            lat,
            lon,
            resolution,
            region_hebrew,
            district_hebrew,
            yishuv_name,
            street1_hebrew,
            street2_hebrew,
            non_urban_intersection_hebrew,
            road1,
            road2,
            road_segment_name,
            accident,
            source,
        ) = row

        db.insert_new_flash_news(
            title,
            link,
            date,
            author,
            description,
            location,
            lat,
            lon,
            resolution,
            region_hebrew,
            district_hebrew,
            yishuv_name,
            street1_hebrew,
            street2_hebrew,
            non_urban_intersection_hebrew,
            road1,
            road2,
            road_segment_name,
            accident,
            source,
            tweet_id=tweet_id,
        )
Example #9
def test_sanity_get_latest_date():
    db = init_db()
    db.get_latest_date_of_source("ynet")
    db.get_latest_date_of_source("walla")
    db.get_latest_date_of_source("twitter")
Example #10
def test_sanity_get_latest_date():
    db = init_db()
    db.get_latest_date_of_source('ynet')
    db.get_latest_date_of_source('walla')
    db.get_latest_date_of_source('twitter')
Example #11
def mda_twitter(google_maps_key):
    TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
    TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')
    TWITTER_ACCESS_KEY = os.environ.get('TWITTER_ACCESS_KEY')
    TWITTER_ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET')

    twitter_user = '******'

    db = news_flash_db_adapter.init_db()

    latest_tweet_id = db.get_latest_tweet_id_from_db()

    # check if there are any MDA tweets in the DB
    if latest_tweet_id:
        mda_tweets = get_user_tweets(twitter_user, latest_tweet_id,
                                     TWITTER_CONSUMER_KEY,
                                     TWITTER_CONSUMER_SECRET,
                                     TWITTER_ACCESS_KEY, TWITTER_ACCESS_SECRET,
                                     google_maps_key)
    else:
        mda_tweets = get_user_tweets(twitter_user, 'no_tweets',
                                     TWITTER_CONSUMER_KEY,
                                     TWITTER_CONSUMER_SECRET,
                                     TWITTER_ACCESS_KEY, TWITTER_ACCESS_SECRET,
                                     google_maps_key)
    if mda_tweets is None:
        return

    mda_tweets = mda_tweets.loc[:, [
        'tweet_id', 'title', 'link', 'date', 'author', 'description',
        'location', 'lat', 'lon', 'resolution', 'region_hebrew',
        'district_hebrew', 'yishuv_name', 'street1_hebrew', 'street2_hebrew',
        'non_urban_intersection_hebrew', 'road1', 'road2', 'road_segment_name',
        'accident', 'source'
    ]]

    for row in mda_tweets.itertuples(index=False):
        (tweet_id, title, link, date, author, description, location, lat, lon,
         resolution, region_hebrew, district_hebrew, yishuv_name,
         street1_hebrew, street2_hebrew, non_urban_intersection_hebrew, road1,
         road2, road_segment_name, accident, source) = row

        db.insert_new_flash_news(title,
                                 link,
                                 date,
                                 author,
                                 description,
                                 location,
                                 lat,
                                 lon,
                                 resolution,
                                 region_hebrew,
                                 district_hebrew,
                                 yishuv_name,
                                 street1_hebrew,
                                 street2_hebrew,
                                 non_urban_intersection_hebrew,
                                 road1,
                                 road2,
                                 road_segment_name,
                                 accident,
                                 source,
                                 tweet_id=tweet_id)
Example #12
def get_db_matching_location(latitude, longitude, resolution, road_no=None):
    """
    extracts location from db by closest geo point to location found, using road number if provided and limits to
    requested resolution
    :param latitude: location latitude
    :param longitude: location longitude
    :param resolution: wanted resolution
    :param road_no: road number if there is
    :return: a dict containing all the geo fields stated in
    resolution dict, with values filled according to resolution
    """
    final_loc = {}
    for field in resolution_dict['אחר']:
        final_loc[field] = None
    try:
        # READ MARKERS FROM DB
        geod = Geodesic.WGS84
        relevant_fields = resolution_dict[resolution]
        markers = init_db().get_markers_for_location_extraction()
        markers['geohash'] = markers.apply(lambda x: geohash.encode(
            x['latitude'], x['longitude'], precision=4),
                                           axis=1)
        markers_orig = markers.copy()
        if resolution != 'אחר':
            if road_no is not None and road_no > 0 and (
                    'road1' in relevant_fields or 'road2' in relevant_fields):
                markers = markers.loc[(markers['road1'] == road_no) |
                                      (markers['road2'] == road_no)]
            for field in relevant_fields:
                if field == 'road1':
                    markers = markers.loc[markers[field].notnull()]
                    markers = markers.loc[markers[field] > 0]
                elif field == 'region_hebrew' or field == 'district_hebrew' or \
                        field == 'yishuv_name' or field == 'street1_hebrew':
                    markers = markers.loc[markers[field].notnull()]
                    markers = markers.loc[markers[field] != '']
        if markers.count()[0] == 0:
            markers = markers_orig

        # FILTER BY GEOHASH
        curr_geohash = geohash.encode(latitude, longitude, precision=4)
        if markers.loc[markers['geohash'] == curr_geohash].count()[0] > 0:
            markers = markers.loc[markers['geohash'] == curr_geohash].copy()

        # CREATE DISTANCE FIELD
        markers['dist_point'] = markers.apply(lambda x: geod.Inverse(
            latitude, longitude, x['latitude'], x['longitude'])['s12'],
                                              axis=1)
        markers = markers.replace({np.nan: None})
        most_fit_loc = markers.loc[
            markers['dist_point'] ==
            markers['dist_point'].min()].iloc[0].to_dict()
        for field in relevant_fields:
            if most_fit_loc[field] is not None:
                if (type(most_fit_loc[field]) == str and (most_fit_loc[field] == '' or most_fit_loc[field] == 'nan')) \
                        or (type(most_fit_loc[field]) == np.float64 and np.isnan(most_fit_loc[field])):
                    final_loc[field] = None
                else:
                    final_loc[field] = most_fit_loc[field]

    except Exception as _:
        logging.info(
            'db matching failed for latitude {0}, longitude {1}, resolution {2}, road no {3}'
            .format(latitude, longitude, resolution, road_no))
    return final_loc
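
A minimal usage sketch for the lookup above, reusing the coordinates and the "רחוב" (street) resolution seen in Example #1; the printed fields are illustrative:

# Find the closest street-resolution marker to a point in Ramat Gan.
loc = get_db_matching_location(
    latitude=32.0861791,
    longitude=34.8098462,
    resolution="רחוב",  # street-level resolution, as used in Example #1
    road_no=None,
)
print(loc.get("yishuv_name"), loc.get("street1_hebrew"))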