def fetch_save_news_tweets_details(db, config, twarc_connector):
    """Fetch the recursive replies and retweets of the fake-news tweets and save them to the database."""
    twarc_connector.change_params(window_limit=2, time_window=8)
    news_tweets = get_all_news_tweets(db, config)
    num_process = int(config["Selenium"]["num_process"])
    news_tweets_chunks = chunkify(news_tweets, num_process)
    multiprocess_function(num_process,
                          function_ref=fetch_save_news_tweets_details_job,
                          args=(news_tweets_chunks, config, twarc_connector))
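# `chunkify` and `multiprocess_function` are defined elsewhere in the repository.
# The sketches below show the behaviour the calls in this module appear to rely
# on (split a list into roughly equal chunks, then run one worker process per
# chunk, passing the remaining args to every worker). The exact splitting and
# argument-passing convention is an assumption, not the project's implementation.
from multiprocessing import Process


def chunkify_sketch(items, num_chunks):
    # Split `items` into `num_chunks` roughly equal slices (round-robin).
    num_chunks = max(1, num_chunks)
    return [items[i::num_chunks] for i in range(num_chunks)]


def multiprocess_function_sketch(num_process, function_ref, args):
    # The first element of `args` is assumed to be the list of per-process
    # chunks; the remaining elements are shared by every worker.
    chunks, *shared = args
    processes = [Process(target=function_ref, args=(chunks[i], *shared))
                 for i in range(min(num_process, len(chunks)))]
    for p in processes:
        p.start()
    for p in processes:
        p.join()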
def get_from_facebook(db):
    news_collection = db[Constants.NEWS_COLLECTION]
    url_list = []
    num_process = 1
    for query in [{"ref_source_url": {"$regex": "archive"}},
                  {"ref_source_url": {"$regex": "facebook"}}]:
        for fb_url in news_collection.find(query, {
                "news_id": 1,
                "ref_source_url": 1,
                "ref_source.ref_archive_url": 1,
                "ref_source.text": 1
        }):
            # "ref_source.text": {"$in": ['NONE']}
            text = ""
            if fb_url:
                if "ref_source" in fb_url and "ref_archive_url" in fb_url["ref_source"]:
                    ref_source_url = fb_url["ref_source"]["ref_archive_url"]
                    text = fb_url["ref_source"].get("text", "")
                else:
                    ref_source_url = fb_url["ref_source_url"]
                filter_list = [
                    "Sorry", "WATCH LIVE", "you may know", "See more",
                    "Anyone can see", "This video may", "more reply",
                    "Skip all", "Sign up", "in the group and what they post",
                    "log in to continue", "having problems playing",
                    "# GiselleMaxwellslist became",
                ]
                # Re-crawl entries whose stored text is too short or looks like
                # Facebook boilerplate rather than the actual post content.
                if len(text.split()) < 10 or any(i in text for i in filter_list):
                    url_list.append((fb_url["news_id"], ref_source_url))
    url_chunk_list = chunkify(url_list, num_process)
    multiprocess_function(num_process, job_from_facebook, (url_chunk_list, db))
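# A sketch of the per-process worker dispatched above. The repository's
# `job_from_facebook` is not shown in this section; this version assumes the
# worker receives one chunk of (news_id, url) pairs plus the database handle,
# and fetches the page text with requests/BeautifulSoup rather than the
# Selenium-based crawler the project configures -- both are assumptions made
# for illustration only.
import logging

import requests
from bs4 import BeautifulSoup


def job_from_facebook_sketch(url_chunk, db):
    news_collection = db[Constants.NEWS_COLLECTION]
    for news_id, url in url_chunk:
        try:
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(" ", strip=True)
            # Store the refreshed text back on the news document.
            news_collection.update_one({"news_id": news_id},
                                       {"$set": {"ref_source.text": text}})
        except Exception as ex:
            logging.exception("Failed to refresh %s: %s", url, ex)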
def fetch_save_user_profile(db, config, screen_name_user_id_list, twarc_connector):
    twarc_connector.change_params(window_limit=900, time_window=900)
    num_process = int(config["Selenium"]["num_process"])
    print("Length Before {}".format(len(screen_name_user_id_list)))
    # Deduplicate so each user profile is fetched only once.
    screen_name_user_id_list = list(set(screen_name_user_id_list))
    print("Length After {}".format(len(screen_name_user_id_list)))
    user_name_chunks = chunkify(screen_name_user_id_list, num_process)
    logging.info("Total no. of users to fetch profile info : {}".format(
        len(screen_name_user_id_list)))
    multiprocess_function(num_process,
                          function_ref=fetch_save_user_profile_job,
                          args=(user_name_chunks, twarc_connector, db))
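# A sketch of the worker dispatched above. The repository's
# `fetch_save_user_profile_job` is not shown here; this version assumes the
# chunk holds (screen_name, user_id) pairs, that the connector exposes a
# hypothetical `fetch_user_profile(user_id)` method returning the profile as a
# dict, and that profiles live in a "user_profiles" collection -- all of these
# are assumptions made for illustration.
def fetch_save_user_profile_job_sketch(user_chunk, twarc_connector, db):
    user_collection = db["user_profiles"]  # hypothetical collection name
    for screen_name, user_id in user_chunk:
        try:
            profile = twarc_connector.fetch_user_profile(user_id)  # hypothetical connector method
            if profile:
                profile["screen_name_queried"] = screen_name
                # Upsert on the user id so repeated runs do not duplicate rows.
                user_collection.update_one({"id_str": profile["id_str"]},
                                           {"$set": profile}, upsert=True)
        except Exception as ex:
            logging.exception("Profile fetch failed for %s: %s", user_id, ex)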
def fact_check_crawler(name, page_count, db):
    if name == "poynter":
        main_url = "https://www.poynter.org/ifcn-covid-19-misinformation/page/{}/?covid_countries=0&covid_rating=0&covid_fact_checkers=0&orderby=views&order=DESC#038;covid_rating=0&covid_fact_checkers=0&orderby=views&order=DESC"
        crawl_func = page_in_poynter
    elif name == "snopes":
        main_url = "https://www.snopes.com/fact-check/rating/false/page/{}"
        crawl_func = page_in_snope
    else:
        raise NotImplementedError

    news_collection = db[Constants.NEWS_COLLECTION]
    if page_count > 0:
        all_pages = list(range(1, page_count + 1))
    else:
        all_pages = [-1]
    num_process = os.cpu_count() - 3
    all_pages_chunkify = chunkify(all_pages, num_process)
    multiprocess_function(num_process,
                          function_ref=fetch_fact_check,
                          args=(all_pages_chunkify, main_url, news_collection,
                                crawl_func))
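# A sketch of the page worker dispatched above. The repository's
# `fetch_fact_check` is not shown in this section; this version assumes each
# crawl function takes a fully formatted page URL and returns a list of
# article dicts keyed by "ref_source_url" -- that return shape is an
# assumption made for illustration only.
def fetch_fact_check_sketch(page_chunk, main_url, news_collection, crawl_func):
    for page in page_chunk:
        # A page value of -1 is used above to mean "no pagination".
        page_url = main_url.format(page) if page > 0 else main_url
        try:
            for article in crawl_func(page_url):
                # Upsert on the source URL so re-runs do not create duplicates.
                news_collection.update_one(
                    {"ref_source_url": article["ref_source_url"]},
                    {"$set": article}, upsert=True)
        except Exception as ex:
            logging.exception("Failed to crawl page %s: %s", page_url, ex)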
def ReliableCrawler(name, page_count, db):
    if name == "cdc":
        main_url = "https://www.cdc.gov/media/archives.html?Sort=Article%20Date%3A%3Adesc&Page={}"
        crawl_func = page_in_cdc
    elif name == "who":
        main_url = "https://www.who.int/news-room/releases/{}"
        crawl_func = page_in_who
    elif name == "nih":
        main_url = "https://search.nih.gov/search/docs?affiliate=nih&dc=565&page={}&query=covid-19&utf8=%E2%9C%93"
        crawl_func = page_in_nih
    elif name == "webmd":
        main_url = "https://www.webmd.com/search/search_results/default.aspx?query=covid19&page={}"
        crawl_func = page_in_webMD
    elif name == "smithsonianmag":
        main_url = "https://www.smithsonianmag.com/search/?q=covid-19&page={}"
        crawl_func = page_in_smithsonianmag
    elif name == "science_daily":
        main_url = "https://www.sciencedaily.com/search/?keyword=covid19#gsc.tab=0&gsc.q=covid%2019%20site%3Awww.sciencedaily.com&gsc.sort=&gsc.page={}"
        crawl_func = page_in_science_daily
    elif name == "healthline":
        main_url = "https://www.healthline.com/health-news?ref=global"
        crawl_func = page_in_healthline
    elif name == "ecdc":
        main_url = "./crawled_data/ecdc.html"
        crawl_func = page_in_ecdc
    elif name == "mnt":
        main_url = "https://www.medicalnewstoday.com/coronavirus"
        crawl_func = page_in_MNT
    elif name == "mayo_clinic":
        main_url = "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963"
        crawl_func = page_in_mayo_clinic
    elif name == "celeveland":
        main_url = "https://newsroom.clevelandclinic.org/category/news-releases/page/{}/"
        crawl_func = page_in_cleveland_clink
    elif name == "snopes":
        main_url = "https://www.snopes.com/news/page/{}"
        crawl_func = page_in_snopes
    elif name == "politico":
        main_url = "https://www.politico.com/search/{}?q=covid19"
        crawl_func = page_in_politico
    elif name == "dn":
        main_url = "{}"
        crawl_func = page_in_dn
    elif name == "publico":
        main_url = "{}"
        crawl_func = page_in_publico
    elif name == "afp":
        main_url = "https://www.afp.com/fr/search/results/covid-19?page={}&f[0]=im_field_tags%3A74"
        crawl_func = page_in_afp
    elif name == "elpais":
        main_url = "https://elpais.com/noticias/covid-19/{}/"
        crawl_func = page_in_elpais
    elif name == "abces":
        main_url = "https://www.abc.es/hemeroteca/resultados-busqueda-avanzada/todo/pagina-{}?tod=covid&nin=19"
        crawl_func = page_in_abces
    elif name == "animalpolitico":
        main_url = "{}"
        crawl_func = page_in_animalpolitico
    elif name == "lemonde":
        main_url = "https://www.lemonde.fr/recherche/?search_keywords=covid-19&start_at=03/01/2020&end_at=26/07/2020&search_sort=relevance_desc&page={}"
        crawl_func = page_in_lemonde
    elif name == "jn":
        main_url = "{}"
        crawl_func = page_in_jn
    elif name == "milenio":
        main_url = "https://www.efe.com/efe/espana/busqueda/50000538?q=covid-19&p={}&s=0"
        crawl_func = page_in_milenio
    else:
        raise NotImplementedError

    # TODO: Automatically extract the page number
    if page_count > 0:
        all_pages = list(range(1, page_count + 1))
    else:
        all_pages = [-1]
    num_process = os.cpu_count() - 3
    all_pages_chunkify = chunkify(all_pages, num_process)
    multiprocess_function(num_process,
                          function_ref=fetch_save_collection,
                          args=(all_pages_chunkify, main_url, db, crawl_func))
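# Illustrative invocation only: the MongoDB connection details, database name,
# and page counts below are placeholder assumptions, not values taken from the
# project's configuration.
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)  # hypothetical connection details
    demo_db = client["covid_news"]            # hypothetical database name
    fact_check_crawler("poynter", page_count=10, db=demo_db)
    ReliableCrawler("cdc", page_count=5, db=demo_db)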
def multiprocess_fetch_save_tweets(tweet_search_entries, twarc_connector, db):
    num_process = os.cpu_count() - 3
    tweet_search_entry_chunks = chunkify(tweet_search_entries, num_process)
    multiprocess_function(num_process, fetch_save_tweets_job,
                          (tweet_search_entry_chunks, twarc_connector, db))
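# A sketch of the worker invoked above. The repository's `fetch_save_tweets_job`
# is not shown here; this version assumes each search entry carries a "query"
# and a "news_id", that the connector exposes a twarc-style `search(query)`
# generator, and that tweets are stored in a "tweets" collection -- all of
# which are assumptions made for illustration.
def fetch_save_tweets_job_sketch(tweet_search_entry_chunk, twarc_connector, db):
    tweet_collection = db["tweets"]  # hypothetical collection name
    for entry in tweet_search_entry_chunk:
        try:
            for tweet in twarc_connector.search(entry["query"]):  # assumed connector API
                tweet["news_id"] = entry["news_id"]
                # Upsert on the tweet id so repeated crawls do not duplicate rows.
                tweet_collection.update_one({"id_str": tweet["id_str"]},
                                            {"$set": tweet}, upsert=True)
        except Exception as ex:
            logging.exception("Search failed for %s: %s", entry.get("news_id"), ex)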