def test_extract_location():
    """End-to-end check: extract_geo_features resolves this tweet to street level."""
    # Requires a live Google Maps key; skip gracefully where it is absent.
    if not secrets.exists("GOOGLE_MAPS_KEY"):
        pytest.skip("Could not find GOOGLE_MAPS_KEY")
    # Raw newsflash fields as scraped from the MDA twitter account.
    parsed = dict(
        link="https://twitter.com/mda_israel/status/1253010741080326148",
        title='בשעה 19:39 התקבל דיווח במוקד 101 של מד"א במרחב דן על הולכת רגל שככל הנראה נפגעה מאופנוע ברחוב ביאליק ברמת גן. צוותי מד"א מעניקים טיפול ומפנים לבי"ח איכילוב 2 פצועים: אישה כבת 30 במצב קשה, עם חבלה רב מערכתית ורוכב האופנוע, צעיר בן 18 במצב בינוני, עם חבלות בראש ובגפיים.',
        description='בשעה 19:39 התקבל דיווח במוקד 101 של מד"א במרחב דן על הולכת רגל שככל הנראה נפגעה מאופנוע ברחוב ביאליק ברמת גן. צוותי מד"א מעניקים טיפול ומפנים לבי"ח איכילוב 2 פצועים: אישה כבת 30 במצב קשה, עם חבלה רב מערכתית ורוכב האופנוע, צעיר בן 18 במצב בינוני, עם חבלות בראש ובגפיים.',
        source="twitter",
        tweet_id=1253010741080326144,
        author="מגן דוד אדום",
        date=datetime.datetime(2020, 4, 22, 19, 39, 51),
        accident=True,
    )
    # Expected geo fields after extraction: street-level resolution
    # ("רחוב"), street "ביאליק" in the city ("yishuv") רמת גן.
    expected = NewsFlash(
        **parsed,
        lat=32.0861791,
        lon=34.8098462,
        resolution="רחוב",
        location="רחוב ביאליק ברמת גן",
        road_segment_name=None,
        district_hebrew=None,
        non_urban_intersection_hebrew=None,
        region_hebrew=None,
        road1=None,
        road2=None,
        street1_hebrew="ביאליק",
        street2_hebrew=None,
        yishuv_name="רמת גן",
    )
    actual = NewsFlash(**parsed)
    location_extraction.extract_geo_features(init_db(), actual)
    # Compare field-by-field so a failure names the mismatching attribute.
    for k in to_dict(expected):
        assert getattr(actual, k) == getattr(expected, k)
def scrape_all():
    """Scrape every configured newsflash source and store the results in the DB."""
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    db = init_db()
    # RSS-based sources share the same scraping entry point.
    for rss_source in ("ynet", "walla"):
        scrape_extract_store_rss(rss_source, db)
    scrape_extract_store_twitter("mda_israel", db)
def main():
    """Entry point for the news flash scraping process."""
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    db = init_db()
    # Scrape both RSS sites, then the MDA twitter account.
    for site in ("ynet", "walla"):
        rss_sites.scrape_extract_store(site, db)
    twitter.scrape_extract_store("mda_israel", db)
def main(google_maps_key):
    """Entry point for the news flash process.

    :param google_maps_key: google maps key
    """
    sys.path.append(os.path.dirname(os.path.realpath(__file__)))
    db = init_db()
    # Both RSS sites go through the same scrape/extract/store pipeline.
    for site_name in ('ynet', 'walla'):
        rss_sites.scrape_extract_store(site_name, google_maps_key, db)
    mda_twitter(google_maps_key)
def main(google_maps_key, source=None, news_flash_id=None):
    """Re-run location extraction on stored news flash records.

    :param google_maps_key: google maps key used during the update
    :param source: optional source name to restrict the update to
    :param news_flash_id: optional specific record id; takes precedence over source
    """
    db = news_flash_db_adapter.init_db()
    # A specific id wins over a source filter; no filter means "all records".
    if news_flash_id is not None:
        news_flash_data = db.get_all_news_flash_data_for_updates(id=news_flash_id)
    elif source is not None:
        news_flash_data = db.get_all_news_flash_data_for_updates(source=source)
    else:
        news_flash_data = db.get_all_news_flash_data_for_updates()
    # Truthiness instead of len(...) > 0 — works for lists and query results alike.
    if news_flash_data:
        update_news_flash(db, google_maps_key, news_flash_data)
    else:
        # Lazy %-style args: the message is only built if the record is emitted.
        logging.info('no matching news flash found, source=%s, id=%s',
                     source, news_flash_id)
def update_all_in_db(source=None, newsflash_id=None):
    """
    Re-classify and re-extract location for stored newsflash items.
    Should be executed each time the classification or location-extraction
    logic is updated.
    """
    db = init_db()
    # Narrowest filter first: a specific id, then a source, then everything.
    if newsflash_id is not None:
        items = db.get_newsflash_by_id(newsflash_id)
    elif source is not None:
        items = db.select_newsflash_where_source(source)
    else:
        items = db.get_all_newsflash()
    for item in items:
        classify = news_flash_classifiers[item.source]
        item.organization = classify_organization(item.source)
        item.accident = classify(item.description or item.title)
        # Geo features are only meaningful for accident items.
        if item.accident:
            extract_geo_features(db, item)
    db.commit()
def remove_duplicate_news_flash_rows():
    """Delete duplicate rows from the news flash table."""
    from anyway.parsers import news_flash_db_adapter

    db = news_flash_db_adapter.init_db()
    db.remove_duplicate_rows()
def mda_twitter(google_maps_key):
    """Fetch new MDA tweets, extract geo features and store them in the DB.

    :param google_maps_key: google maps key forwarded to the tweet processor
    """
    TWITTER_CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
    TWITTER_CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
    TWITTER_ACCESS_KEY = os.environ.get("TWITTER_ACCESS_KEY")
    TWITTER_ACCESS_SECRET = os.environ.get("TWITTER_ACCESS_SECRET")
    twitter_user = "******"
    db = news_flash_db_adapter.init_db()
    latest_tweet_id = db.get_latest_tweet_id_from_db()
    # Collapse the previously duplicated call: only the "since" id differs.
    # "no_tweets" is the sentinel for an empty DB (no MDA tweets stored yet).
    since_tweet_id = latest_tweet_id if latest_tweet_id else "no_tweets"
    mda_tweets = get_user_tweets(
        twitter_user,
        since_tweet_id,
        TWITTER_CONSUMER_KEY,
        TWITTER_CONSUMER_SECRET,
        TWITTER_ACCESS_KEY,
        TWITTER_ACCESS_SECRET,
        google_maps_key,
    )
    if mda_tweets is None:
        return
    # Fix column order before positional unpacking below.
    mda_tweets = mda_tweets.loc[
        :,
        [
            "tweet_id",
            "title",
            "link",
            "date",
            "author",
            "description",
            "location",
            "lat",
            "lon",
            "resolution",
            "region_hebrew",
            "district_hebrew",
            "yishuv_name",
            "street1_hebrew",
            "street2_hebrew",
            "non_urban_intersection_hebrew",
            "road1",
            "road2",
            "road_segment_name",
            "accident",
            "source",
        ],
    ]
    for row in mda_tweets.itertuples(index=False):
        (
            tweet_id,
            title,
            link,
            date,
            author,
            description,
            location,
            lat,
            lon,
            resolution,
            region_hebrew,
            district_hebrew,
            yishuv_name,
            street1_hebrew,
            street2_hebrew,
            non_urban_intersection_hebrew,
            road1,
            road2,
            road_segment_name,
            accident,
            source,
        ) = row
        db.insert_new_flash_news(
            title,
            link,
            date,
            author,
            description,
            location,
            lat,
            lon,
            resolution,
            region_hebrew,
            district_hebrew,
            yishuv_name,
            street1_hebrew,
            street2_hebrew,
            non_urban_intersection_hebrew,
            road1,
            road2,
            road_segment_name,
            accident,
            source,
            tweet_id=tweet_id,
        )
def test_sanity_get_latest_date():
    """Smoke test: latest-date lookup runs without raising for every source."""
    db = init_db()
    for source in ("ynet", "walla", "twitter"):
        db.get_latest_date_of_source(source)
def test_sanity_get_latest_date():
    """Smoke test: each known source answers a latest-date query."""
    db = init_db()
    sources = ['ynet', 'walla', 'twitter']
    for src in sources:
        db.get_latest_date_of_source(src)
def mda_twitter(google_maps_key):
    """Fetch new MDA tweets, extract geo features and store them in the DB.

    :param google_maps_key: google maps key forwarded to the tweet processor
    """
    TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
    TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')
    TWITTER_ACCESS_KEY = os.environ.get('TWITTER_ACCESS_KEY')
    TWITTER_ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET')
    twitter_user = '******'
    db = news_flash_db_adapter.init_db()
    latest_tweet_id = db.get_latest_tweet_id_from_db()
    # Collapse the previously duplicated call: only the "since" id differs.
    # 'no_tweets' is the sentinel for an empty DB (no MDA tweets stored yet).
    since_tweet_id = latest_tweet_id if latest_tweet_id else 'no_tweets'
    mda_tweets = get_user_tweets(twitter_user, since_tweet_id,
                                 TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
                                 TWITTER_ACCESS_KEY, TWITTER_ACCESS_SECRET,
                                 google_maps_key)
    if mda_tweets is None:
        return
    # Fix column order before positional unpacking below.
    mda_tweets = mda_tweets.loc[:, [
        'tweet_id', 'title', 'link', 'date', 'author', 'description',
        'location', 'lat', 'lon', 'resolution', 'region_hebrew',
        'district_hebrew', 'yishuv_name', 'street1_hebrew', 'street2_hebrew',
        'non_urban_intersection_hebrew', 'road1', 'road2',
        'road_segment_name', 'accident', 'source'
    ]]
    for row in mda_tweets.itertuples(index=False):
        (tweet_id, title, link, date, author, description, location, lat, lon,
         resolution, region_hebrew, district_hebrew, yishuv_name,
         street1_hebrew, street2_hebrew, non_urban_intersection_hebrew,
         road1, road2, road_segment_name, accident, source) = row
        db.insert_new_flash_news(title, link, date, author, description,
                                 location, lat, lon, resolution,
                                 region_hebrew, district_hebrew, yishuv_name,
                                 street1_hebrew, street2_hebrew,
                                 non_urban_intersection_hebrew, road1, road2,
                                 road_segment_name, accident, source,
                                 tweet_id=tweet_id)
def get_db_matching_location(latitude, longitude, resolution, road_no=None):
    """
    extracts location from db by closest geo point to location found,
    using road number if provided and limits to requested resolution
    :param latitude: location latitude
    :param longitude: location longitude
    :param resolution: wanted resolution
    :param road_no: road number if there is
    :return: a dict containing all the geo fields stated in resolution dict,
    with values filled according to resolution
    """
    # Start from the broadest field set ('אחר' == "other") all-None, so the
    # returned dict always has a complete, predictable shape.
    final_loc = {field: None for field in resolution_dict['אחר']}
    try:
        # READ MARKERS FROM DB
        geod = Geodesic.WGS84
        relevant_fields = resolution_dict[resolution]
        markers = init_db().get_markers_for_location_extraction()
        # precision=4 geohash cells act as a coarse spatial pre-filter.
        markers['geohash'] = markers.apply(
            lambda x: geohash.encode(x['latitude'], x['longitude'], precision=4),
            axis=1)
        markers_orig = markers.copy()
        if resolution != 'אחר':
            if road_no is not None and road_no > 0 and (
                    'road1' in relevant_fields or 'road2' in relevant_fields):
                markers = markers.loc[(markers['road1'] == road_no) |
                                      (markers['road2'] == road_no)]
            for field in relevant_fields:
                if field == 'road1':
                    markers = markers.loc[markers[field].notnull()]
                    markers = markers.loc[markers[field] > 0]
                # Membership test replaces the chained == comparisons.
                elif field in ('region_hebrew', 'district_hebrew',
                               'yishuv_name', 'street1_hebrew'):
                    markers = markers.loc[markers[field].notnull()]
                    markers = markers.loc[markers[field] != '']
            if markers.count()[0] == 0:
                # Filtering removed every candidate - fall back to the full set.
                markers = markers_orig
        # FILTER BY GEOHASH - keep only the cell of the query point when it
        # actually contains candidates.
        curr_geohash = geohash.encode(latitude, longitude, precision=4)
        if markers.loc[markers['geohash'] == curr_geohash].count()[0] > 0:
            markers = markers.loc[markers['geohash'] == curr_geohash].copy()
        # CREATE DISTANCE FIELD - geodesic distance in meters ('s12').
        markers['dist_point'] = markers.apply(
            lambda x: geod.Inverse(latitude, longitude,
                                   x['latitude'], x['longitude'])['s12'],
            axis=1)
        # np.nan replaces the pd.np alias, deprecated and removed in pandas>=1.0.
        markers = markers.replace({np.nan: None})
        most_fit_loc = markers.loc[
            markers['dist_point'] == markers['dist_point'].min()].iloc[0].to_dict()
        for field in relevant_fields:
            if most_fit_loc[field] is not None:
                # Treat empty / 'nan' strings and float NaNs as missing values.
                if (isinstance(most_fit_loc[field], str)
                        and most_fit_loc[field] in ('', 'nan')) \
                        or (isinstance(most_fit_loc[field], np.float64)
                            and np.isnan(most_fit_loc[field])):
                    final_loc[field] = None
                else:
                    final_loc[field] = most_fit_loc[field]
    except Exception:
        # Best-effort lookup: log with traceback (the original swallowed the
        # error details) and return the all-None dict.
        logging.exception(
            'db matching failed for latitude %s, longitude %s, resolution %s, road no %s',
            latitude, longitude, resolution, road_no)
    return final_loc