def process_after_parsing(news_item, maps_key):
    """Classify a parsed news item and, if it describes an accident, enrich it
    with location / geocoding / DB-matched geo fields.

    :param news_item: dict with at least 'title' and 'description' keys;
        mutated in place with 'accident', 'location', 'lat', 'lon',
        'resolution' and the matched DB location columns.
    :param maps_key: Google Maps API key passed to geocode_extract.
    :return: the same (mutated) news_item dict.

    Geo enrichment is best-effort: any failure (geocoding, DB lookup, missing
    keys) leaves the item partially enriched and is logged instead of raised.
    """
    import logging

    location = ''
    news_item['accident'] = classify_ynet(news_item['title'])
    try:
        if news_item['accident']:
            # Prefer a location extracted from the body text; fall back to the title.
            if news_item['description'] != '':
                location = manual_filter_location_of_text(news_item['description'])
            if location == '':
                location = manual_filter_location_of_text(news_item['title'])
            news_item['location'] = location
            geo_location = geocode_extract(location, maps_key)
            if geo_location is not None:
                news_item['lat'] = geo_location['geom']['lat']
                news_item['lon'] = geo_location['geom']['lng']
                news_item['resolution'] = set_accident_resolution(geo_location)
                db_location = get_db_matching_location(news_item['lat'],
                                                       news_item['lon'],
                                                       news_item['resolution'],
                                                       geo_location['road_no'])
                # Copy the matched administrative / road fields onto the item.
                for col in [
                        'region_hebrew', 'district_hebrew', 'yishuv_name',
                        'street1_hebrew', 'street2_hebrew',
                        'non_urban_intersection_hebrew', 'road1', 'road2',
                        'road_segment_name'
                ]:
                    news_item[col] = db_location[col]
    except Exception:
        # Best-effort enrichment: log instead of silently swallowing so
        # geocoding/DB failures are visible, but never fail the pipeline.
        logging.exception('process_after_parsing: geo enrichment failed for item %r',
                          news_item.get('title'))
    return news_item
def ynet_news_flash_crawl(rss_link, maps_key):
    """Crawl the given ynet RSS feed and scrape news flashes newer than the
    latest one already stored in the DB.

    :param rss_link: rss link to crawl and get news_flash from
    :param maps_key: google maps key for geocode
    :return: scraped news_flash are added to the db (via the scrapy spider)
    """
    latest_date = get_latest_date_from_db('ynet')
    feed = feedparser.parse(rss_link)
    process = CrawlerProcess()
    # Oldest-first so items are inserted in chronological order.
    for entry in feed.entries[::-1]:
        # entry.published ends with a ' +ZZZZ' utc-offset suffix (6 chars);
        # strip it so strptime yields a naive datetime comparable to the DB's.
        entry_parsed_date = datetime.strptime(entry.published[:-6],
                                              '%a, %d %b %Y %H:%M:%S')
        entry_parsed_date = entry_parsed_date.replace(tzinfo=None)
        # Crawl everything when the DB is empty, otherwise only newer entries.
        if latest_date is None or entry_parsed_date > latest_date:
            news_item = {
                'date_parsed': entry_parsed_date,
                'title': entry.title,
                'link': entry.links[0].href,
                'date': entry.published,
                'location': '',
                'lat': 0,
                'lon': 0,
                'accident': classify_ynet(entry.title),
                'source': 'ynet',
            }
            process.crawl(YnetFlashScrap,
                          entry.links[0].href,
                          news_item=news_item,
                          maps_key=maps_key)
    process.start()
def extract_geo_features(parsed_item, google_maps_key):
    """Return a copy of *parsed_item* layered over the default extracted-feature
    fields, enriched with accident classification and (best-effort) geo data.

    NOTE(review): this function is redefined verbatim later in the file; the
    later definition shadows this one at import time.
    """
    news_item = {**init_news_item_extracted_features(), **parsed_item}
    news_item['accident'] = classify_ynet(news_item['title'])
    try:
        if not news_item['accident']:
            return news_item

        # Location: try the description first, then fall back to the title.
        found_location = None
        if news_item['description'] is not None:
            found_location = manual_filter_location_of_text(news_item['description'])
        if found_location is None:
            found_location = manual_filter_location_of_text(news_item['title'])
        news_item['location'] = found_location

        geo_location = geocode_extract(found_location, google_maps_key)
        if geo_location is None:
            return news_item

        news_item['lat'] = geo_location['geom']['lat']
        news_item['lon'] = geo_location['geom']['lng']
        news_item['resolution'] = set_accident_resolution(geo_location)
        db_location = get_db_matching_location(news_item['lat'],
                                               news_item['lon'],
                                               news_item['resolution'],
                                               geo_location['road_no'])
        db_columns = ('region_hebrew', 'district_hebrew', 'yishuv_name',
                      'street1_hebrew', 'street2_hebrew',
                      'non_urban_intersection_hebrew', 'road1', 'road2',
                      'road_segment_name')
        for column in db_columns:
            news_item[column] = db_location[column]
    except Exception:
        # Geo enrichment is best-effort; failures leave the item as-is.
        pass
    return news_item
# NOTE(review): duplicate redefinition of extract_geo_features — identical in
# behavior to the earlier definition (only quote style differs). Being the
# later definition, THIS one is the binding used at runtime; the duplicate
# should probably be removed, but is kept here untouched.
def extract_geo_features(parsed_item, google_maps_key):
    # Merge parsed fields over the default extracted-feature skeleton so every
    # expected key exists on the result.
    news_item = {**init_news_item_extracted_features(), **parsed_item}
    location = None
    news_item["accident"] = classify_ynet(news_item["title"])
    try:
        if news_item["accident"]:
            # Prefer a location from the description; fall back to the title.
            if news_item["description"] is not None:
                location = manual_filter_location_of_text(
                    news_item["description"])
            if location is None:
                location = manual_filter_location_of_text(news_item["title"])
            news_item["location"] = location
            geo_location = geocode_extract(location, google_maps_key)
            if geo_location is not None:
                news_item["lat"] = geo_location["geom"]["lat"]
                news_item["lon"] = geo_location["geom"]["lng"]
                news_item["resolution"] = set_accident_resolution(geo_location)
                db_location = get_db_matching_location(
                    news_item["lat"],
                    news_item["lon"],
                    news_item["resolution"],
                    geo_location["road_no"],
                )
                # Copy the matched administrative / road fields onto the item.
                for col in [
                    "region_hebrew",
                    "district_hebrew",
                    "yishuv_name",
                    "street1_hebrew",
                    "street2_hebrew",
                    "non_urban_intersection_hebrew",
                    "road1",
                    "road2",
                    "road_segment_name",
                ]:
                    news_item[col] = db_location[col]
    except Exception as _:
        # NOTE(review): broad swallow — geo enrichment is best-effort, but
        # consider logging here so geocoding/DB failures are visible.
        pass
    return news_item