def persist_uniques(self):
    # persist only if an entry with this title does not exist yet in the database
    exists = FeedDataModel.query.filter(
        FeedDataModel.title == self.title).first() is not None
    if not exists:
        self.save()
        log.info("Persisted entry with title " + self.title)
    else:
        log.info("Skipping entry with title '" + self.title +
                 "' as it already exists!")
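# Hedged sketch, not the repo's actual code: persist_uniques() above relies on a
# save() helper on the model. With Flask-SQLAlchemy it is typically implemented as
# a small mixin like the one below; the "db" import path is an assumption.
from app import db  # assumed import path for the shared SQLAlchemy handle


class SaveMixin:
    def save(self):
        # add this model instance to the shared session and commit it
        db.session.add(self)
        db.session.commit()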
def crawl_data(feed_urls: List[Source]):
    log.info("Collecting data from rss feeds...")
    rss_data_dict = crawl_rss_data(feed_urls)
    log.info("Collected " +
             str(len([item for sublist in rss_data_dict.values()
                      for item in sublist])) + " elements.")
    return rss_data_dict
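# Hedged sketch of the crawl_rss_data() helper used above; the real implementation
# lives elsewhere in the repo. It is assumed to map each Source's id to the parsed
# feedparser entries of that source's url ("id" and "url" attributes are assumptions).
import feedparser
from feedparser import FeedParserDict


def crawl_rss_data_sketch(feed_urls: List[Source]) -> Dict[str, List[FeedParserDict]]:
    rss_data: Dict[str, List[FeedParserDict]] = {}
    for source in feed_urls:
        # feedparser tolerates network/parse errors and returns partial results
        parsed = feedparser.parse(source.url)
        rss_data[source.id] = list(parsed.entries)
    return rss_data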
def build_new_entries(self, rss_data, all_tags, all_languages, all_sources):
    entries = []
    sources_map = {source.id: source for source in all_sources}
    for source_id in rss_data:
        source = sources_map.get(source_id)
        entries.append(self.__build_new_entries(
            rss_data[source_id], all_tags, all_languages, source))
    log.info("New entities: " +
             str(len([item for sublist in entries for item in sublist])))
    return entries
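# Hedged sketch of the private __build_new_entries() helper called above; the entry
# shape and the "title"/"link" field names are assumptions, not the repo's API.
def _build_new_entries_sketch(feed_entries, all_tags, all_languages, source):
    new_entries = []
    for feed_entry in feed_entries:
        # map the raw feedparser entry onto the api's entry shape (illustrative only)
        new_entries.append({
            "title": feed_entry.get("title", ""),
            "link": feed_entry.get("link", ""),
            "source_id": source.id if source else None,
        })
    return new_entries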
def create_crawler(filter_words, feed_urls):
    log.info("Configure rss crawler...")
    # set the global values
    global keywords
    keywords = filter_words
    global feeds
    feeds = feed_urls
    log.info("Successfully configured rss crawler!")
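# Hedged sketch: the global statements in create_crawler() imply module-level
# declarations like these near the top of the crawler module (names taken from the
# code above, initial values assumed).
keywords = []  # filter words, set via create_crawler()
feeds = []     # feed urls/sources, set via create_crawler()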
def crawl_and_persist_data():
    success = False
    try:
        log.info("Collecting data from rss feeds...")
        rss_data_list = crawl_rss_data()
        log.info("Collected " + str(len(rss_data_list)) + " elements.")
        log.info("Persisting collected data to database...")
        for feed_entry in rss_data_list:
            build_feed_data_and_persist(feed_entry)
        success = True
    except (SQOperationalError, PsyOperationalError, ProgrammingError) as e:
        log.error("Persisting data from crawled feed failed with exception: " +
                  str(e))
    if success:
        log.info("Successfully persisted rss feed data to database!")
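# Hedged sketch of build_feed_data_and_persist(); it presumably maps one parsed feed
# entry onto FeedDataModel and reuses persist_uniques() from above. The constructor
# arguments are assumptions about the model's columns.
def build_feed_data_and_persist_sketch(feed_entry):
    data = FeedDataModel(title=feed_entry.get("title", ""),
                         link=feed_entry.get("link", ""))
    data.persist_uniques()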
def execute_crawler(uuid):
    if uuid != crawler_token:
        return custom_response("Invalid access token provided!", 400)
    if not api_url:
        return custom_response(
            "API url not provided. Cannot crawl and persist data!", 400)

    # get entry handler
    handler = EntryHandler()

    # create connection to graphQL api
    db_con = GraphQLConnector(api_url=api_url)

    # get all data sources
    log.info("Try to get sources for new entities ...")
    sources = db_con.get_all_sources()
    log.info("Need to check " + str(len(sources)) + " sources.")

    # crawl sources
    from feedparser import FeedParserDict
    rss_data_list: Dict[str, List[FeedParserDict]] = crawl_data(sources)

    # get all existing entries that have been updated within the last 24h
    log.info("Try to get existing entries with time delta 24h from " +
             api_url + "...")
    existing_entries = db_con.get_all_entries(
        updated_at_gte=(dt.datetime.now() - timedelta(hours=24)))
    log.info("Loaded " + str(len(existing_entries)) +
             " existing entries successfully!")

    # check whether the newly crawled data contains items that already exist;
    # if yes, remove them from the newly crawled data and refresh the
    # updated_at timestamp of the old ones
    log.info("Determining new rss data and data that needs to be updated...")
    new_rss_data, existing_entries_4_update = handler.filter_duplicates(
        rss_data_list, existing_entries)
    log.info("Entities to be updated: " + str(len(existing_entries_4_update)) +
             "; New entities: " +
             str(len([item for sublist in new_rss_data.values()
                      for item in sublist])))
    for entry in existing_entries_4_update:
        db_con.update_entry_updated_at(entry)

    # handle languages
    log.info("Detecting languages for new entities...")
    detected_languages = handler.detect_lang(new_rss_data)
    existing_languages = db_con.get_all_languages()
    for language in detected_languages:
        db_con.create_language_if_not_existent(language, existing_languages)
    log.info("Language detection complete.")

    # handle tags
    log.info("Determining tags for new entities...")
    determined_tags = handler.determine_tags(new_rss_data)
    existing_tags = db_con.get_all_tags()
    for tag in determined_tags:
        db_con.create_tag_if_not_existent(tag, existing_tags)
    log.info("Tag determination complete.")

    log.info("Build and persist new entities...")
    updated_languages = db_con.get_all_languages()
    updated_tags = db_con.get_all_tags()
    new_entries = handler.build_new_entries(new_rss_data, updated_tags,
                                            updated_languages, sources)

    # persist new entries (new_entries is a list of per-source lists,
    # so count the flattened total for logging and for the response)
    for entry_set in new_entries:
        for entry in entry_set:
            db_con.create_entry(entry)
    persisted_count = len([item for sublist in new_entries
                           for item in sublist])
    log.info("Crawled and persisted " + str(persisted_count) +
             " rss feed elements!")
    return custom_response(
        "Crawled and persisted " + str(persisted_count) +
        " rss feed elements!", 200)
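# Hedged sketch of EntryHandler.filter_duplicates() as used in execute_crawler();
# matching by title mirrors persist_uniques() above, but the real handler may compare
# other fields. Reading "entry.title" on existing entries is an assumption.
def filter_duplicates_sketch(rss_data_list, existing_entries):
    existing_by_title = {entry.title: entry for entry in existing_entries}
    new_rss_data, entries_4_update = {}, []
    for source_id, feed_entries in rss_data_list.items():
        fresh = []
        for feed_entry in feed_entries:
            match = existing_by_title.get(feed_entry.get("title"))
            if match is not None:
                # already known: schedule an updated_at refresh instead of re-creating
                entries_4_update.append(match)
            else:
                fresh.append(feed_entry)
        new_rss_data[source_id] = fresh
    return new_rss_data, entries_4_update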