Example #1
    def persist_uniques(self):
        # persists only if an element with this title does not exist yet in the database
        exist = FeedDataModel.query.filter(FeedDataModel.title == self.title).scalar() is not None
        if not exist:
            self.save()
            log.info("Persisted entry with title " + self.title)
        else:
            log.info("Skipping entry with title '" + self.title + "' as it already exists!")
Example #2
def crawl_data(feed_urls: List[Source]):
    log.info("Collecting data from rss feeds...")
    rss_data_dict = crawl_rss_data(feed_urls)
    log.info("Collected " + str(
        len([
            item for sublist in list(rss_data_dict.values())
            for item in sublist
        ])) + " elements.")
    return rss_data_dict
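The element count in crawl_data builds a temporary flat list only to take its length; an equivalent count over the same rss_data_dict can be computed directly:

    # Equivalent count without materializing a flattened list.
    total = sum(len(items) for items in rss_data_dict.values())
    log.info("Collected " + str(total) + " elements.")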
Example #3
    def build_new_entries(self, rss_data, all_tags, all_languages, all_sources):
        entries = []
        sources_map = {source.id: source for source in all_sources}

        for source_id in rss_data:
            source = sources_map.get(source_id)
            entries.append(self.__build_new_entries(rss_data[source_id], all_tags, all_languages, source))

        log.info("New entities: " + str(len([item for sublist in entries for item in sublist])))
        return entries
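build_new_entries returns one list per source rather than a flat list, which is why the caller in Example #6 iterates it with a nested loop. If a flat list is needed instead, a minimal sketch using itertools:

        # Sketch: flatten the per-source lists returned by build_new_entries.
        from itertools import chain
        flat_entries = list(chain.from_iterable(entries))
        log.info("New entities: " + str(len(flat_entries)))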
Example #4
def create_crawler(filter_words, feed_urls):
    log.info("Configure rss crawler...")
    # set the global vals
    global keywords
    keywords = filter_words

    global feeds
    feeds = feed_urls

    log.info("Successfully configured rss crawler!")
Example #5
def crawl_and_persist_data():
    success = False
    try:
        log.info("Collecting data from rss feeds...")
        rss_data_list = crawl_rss_data()
        log.info("Collected " + str(len(rss_data_list)) + " elements.")
        log.info("Persisting collected data to database...")
        for feed_entry in rss_data_list:
            build_feed_data_and_persist(feed_entry)
        success = True
    except (SQOperationalError, PsyOperationalError, ProgrammingError) as e:
        log.error("Persisting data from crawled feed failed with exception: " +
                  str(e))

    if success:
        log.info("Successfully persisted rss feed data to database!")
Example #6
def execute_crawler(uuid):
    if uuid != crawler_token:
        return custom_response("Invalid access token provided!", 400)

    if not api_url:
        return custom_response(
            "API url not provided. Cannot crawl and persist data!", 400)

    # get entry handler
    handler = EntryHandler()

    # create connection to graphQL api
    db_con = GraphQLConnector(api_url=api_url)

    # get all data sources
    log.info("Try to get sources for new entities ...")
    sources = db_con.get_all_sources()
    log.info("Need to check " + str(len(sources)) + " sources.")

    # crawl sources
    from feedparser import FeedParserDict
    rss_data_list: Dict[str, List[FeedParserDict]] = crawl_data(sources)

    # get all existing entries that have been updated within the last 24h
    log.info("Try to get existing entries with time delta 24h from " +
             api_url + "...")
    existing_entries = db_con.get_all_entries(
        updated_at_gte=(dt.datetime.now() - timedelta(hours=24)))
    log.info("Loaded existing " + str(len(existing_entries)) +
             " entries successfully!")

    # check whether the newly crawled data contains items that already exist;
    # if so, remove them from the new data and refresh the updated-at
    # timestamp of the existing entries
    log.info("Determining new rss data and data that needs to be updated...")
    new_rss_data, existing_entries_4_update = handler.filter_duplicates(
        rss_data_list, existing_entries)
    log.info("Entities to be updated: " + str(len(existing_entries_4_update)) +
             "; New entities: " + str(
                 len([
                     item for sublist in list(new_rss_data.values())
                     for item in sublist
                 ])))

    for entry in existing_entries_4_update:
        db_con.update_entry_updated_at(entry)

    # handle languages
    log.info("Detecting languages for new entities...")
    detected_languages = handler.detect_lang(new_rss_data)
    existing_languages = db_con.get_all_languages()
    for language in detected_languages:
        db_con.create_language_if_not_existent(language, existing_languages)
    log.info("Language detection complete.")

    # handle tags
    log.info("Determining tags for new entities...")
    determined_tags = handler.determine_tags(new_rss_data)
    existing_tags = db_con.get_all_tags()
    for tag in determined_tags:
        db_con.create_tag_if_not_existent(tag, existing_tags)
    log.info("Tag determination complete.")

    log.info("Build and persist new entities...")
    updated_languages = db_con.get_all_languages()
    updated_tags = db_con.get_all_tags()
    new_entries = handler.build_new_entries(new_rss_data, updated_tags,
                                            updated_languages, sources)

    # persist new entries
    for entry_set in new_entries:
        for entry in entry_set:
            db_con.create_entry(entry)

    log.info("Crawled and persisted " +
             str(len([item for sublist in new_entries
                      for item in sublist])) + " rss feed elements!")
    return custom_response(
        "Crawled and persisted " + str(len(new_entries)) +
        " rss feed elements!", 200)