Example #1
def fetch_save_news_tweets_details(db, config, twarc_connector):
    """Fetch the recursive replies and retweets of the fake-news tweets and save them to the database."""
    twarc_connector.change_params(window_limit=2, time_window=8)
    news_tweets = get_all_news_tweets(db, config)
    num_process = int(config["Selenium"]["num_process"])
    # Split the tweets into one chunk per worker process.
    news_tweets_chunks = chunkify(news_tweets, num_process)
    multiprocess_function(num_process,
                          function_ref=fetch_save_news_tweets_details_job,
                          args=(news_tweets_chunks, config, twarc_connector))
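
Every example in this listing fans work out through the same two helpers, chunkify and multiprocess_function, neither of which is shown here. A minimal sketch of what they are assumed to do (signatures inferred from the call sites; the implementations are guesses, not the repo's code):

import multiprocessing

def chunkify(items, num_chunks):
    # Assumed helper: split `items` into `num_chunks` roughly equal slices,
    # one slice per worker process.
    return [items[i::num_chunks] for i in range(num_chunks)]

def multiprocess_function(num_process, function_ref, args):
    # Assumed helper: start `num_process` workers, giving worker i the i-th
    # chunk from args[0] plus the shared trailing arguments.
    chunks, *shared = args
    processes = []
    for i in range(num_process):
        p = multiprocessing.Process(target=function_ref, args=(chunks[i], *shared))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()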
Example #2
def get_from_facebook(db):
    """Collect news entries whose Facebook/archive reference text is missing or
    looks like Facebook boilerplate, so they can be re-crawled."""
    news_collection = db[Constants.NEWS_COLLECTION]
    url_list = []
    num_process = 1
    # Snippets that indicate the scraped text is Facebook boilerplate
    # (login walls, video players, suggestions) rather than post content.
    filter_list = [
        "Sorry",
        "WATCH LIVE",
        "you may know",
        "See more",
        "Anyone can see",
        "This video may",
        "more reply",
        "Skip all",
        "Sign up",
        "in the group and what they post",
        "log in to continue",
        "having problems playing",
        "# GiselleMaxwellslist became",
    ]
    projection = {
        "news_id": 1,
        "ref_source_url": 1,
        "ref_source.ref_archive_url": 1,
        "ref_source.text": 1,
    }
    for query in [{"ref_source_url": {"$regex": "archive"}},
                  {"ref_source_url": {"$regex": "facebook"}}]:
        for fb_url in news_collection.find(query, projection):
            if not fb_url:
                continue
            text = ""
            # Prefer the archived copy of the reference source when available.
            if "ref_source" in fb_url and "ref_archive_url" in fb_url["ref_source"]:
                ref_source_url = fb_url["ref_source"]["ref_archive_url"]
                text = fb_url["ref_source"].get("text", "")
            else:
                ref_source_url = fb_url["ref_source_url"]
            # Re-crawl entries whose text is too short or matches boilerplate.
            if len(text.split()) < 10 or any(i in text for i in filter_list):
                url_list.append((fb_url["news_id"], ref_source_url))

    url_chunk_list = chunkify(url_list, num_process)
    multiprocess_function(num_process, job_from_facebook, (url_chunk_list, db))
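
The worker job_from_facebook is not included in this listing. A minimal sketch of what such a worker might do: the real implementation likely drives Selenium (given the config["Selenium"] settings used elsewhere), but this sketch uses requests for brevity; the document layout (news_id, ref_source.text) comes from the query above:

import requests
from bs4 import BeautifulSoup

def job_from_facebook(url_chunk, db):
    # Hypothetical worker: re-fetch each reference URL and store the page
    # text back on the news document. Error handling is deliberately coarse.
    news_collection = db[Constants.NEWS_COLLECTION]
    for news_id, ref_source_url in url_chunk:
        try:
            response = requests.get(ref_source_url, timeout=30)
            text = BeautifulSoup(response.text, "html.parser").get_text(" ", strip=True)
        except requests.RequestException:
            continue
        news_collection.update_one({"news_id": news_id},
                                   {"$set": {"ref_source.text": text}})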
Example #3
def fetch_save_user_profile(db, config, screen_name_user_id_list, twarc_connector):
    """Fetch the profile info of the given users and save it to the database."""
    twarc_connector.change_params(window_limit=900, time_window=900)
    num_process = int(config["Selenium"]["num_process"])

    # De-duplicate the (screen_name, user_id) entries before chunking.
    logging.info("Length before de-duplication: {}".format(len(screen_name_user_id_list)))
    screen_name_user_id_list = list(set(screen_name_user_id_list))
    logging.info("Length after de-duplication: {}".format(len(screen_name_user_id_list)))
    user_name_chunks = chunkify(screen_name_user_id_list, num_process)

    logging.info("Total no. of users to fetch profile info : {}".format(
        len(screen_name_user_id_list)))

    multiprocess_function(num_process,
                          function_ref=fetch_save_user_profile_job,
                          args=(user_name_chunks, twarc_connector, db))
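
change_params(window_limit=900, time_window=900) presumably matches Twitter API v1.1's users/lookup quota of 900 requests per 15-minute (900-second) window. The connector class is not shown in this listing; a minimal sketch of the rate-limiting interface these examples assume (the name comes from the call sites, the implementation is a guess):

import time

class TwarcConnector:
    # Assumed wrapper around a twarc client: allow at most `window_limit`
    # requests per `time_window` seconds, sleeping when the budget is spent.
    def __init__(self, window_limit=900, time_window=900):
        self.change_params(window_limit, time_window)

    def change_params(self, window_limit, time_window):
        self.window_limit = window_limit
        self.time_window = time_window
        self.request_times = []

    def wait_for_slot(self):
        # Drop request timestamps that have aged out of the window, then
        # sleep until the oldest remaining request ages out if the window
        # is full.
        now = time.monotonic()
        self.request_times = [t for t in self.request_times
                              if now - t < self.time_window]
        if len(self.request_times) >= self.window_limit:
            time.sleep(self.time_window - (now - self.request_times[0]))
        self.request_times.append(time.monotonic())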
Example #4
def fact_check_crawler(name, page_count, db):
    """Crawl a fact-checking site and save the debunked claims to the database."""
    if name == "poynter":
        main_url = "https://www.poynter.org/ifcn-covid-19-misinformation/page/{}/?covid_countries=0&covid_rating=0&covid_fact_checkers=0&orderby=views&order=DESC"
        crawl_func = page_in_poynter
    elif name == "snopes":
        main_url = "https://www.snopes.com/fact-check/rating/false/page/{}"
        crawl_func = page_in_snope
    else:
        raise NotImplementedError(name)

    news_collection = db[Constants.NEWS_COLLECTION]
    if page_count > 0:
        all_pages = list(range(1, page_count + 1))
    else:
        all_pages = [-1]  # Sentinel used when no page count is given.
    num_process = max(1, os.cpu_count() - 3)  # Leave a few cores free.
    all_pages_chunkify = chunkify(all_pages, num_process)
    multiprocess_function(num_process, function_ref=fetch_fact_check,
                          args=(all_pages_chunkify, main_url,
                                news_collection, crawl_func))
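
The per-site parsers (page_in_poynter, page_in_snope, and the page_in_* family in the next example) are not included in this listing. A hypothetical shape for one, with a placeholder selector; the real selectors depend on each site's markup:

import requests
from bs4 import BeautifulSoup

def page_in_snope(page_url):
    # Hypothetical parser: fetch one listing page and return (title, link)
    # pairs. "article a[href]" is a placeholder, not Snopes's real markup.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    for link in soup.select("article a[href]"):
        articles.append((link.get_text(strip=True), link["href"]))
    return articles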
Example #5
def ReliableCrawler(name, page_count, db):
    """Crawl one reliable news source and save the collected pages to the database."""
    # Each source maps to a listing-page URL template and its page parser.
    # A bare "{}" template means the crawl function receives full URLs (or
    # drives its own pagination) instead of page numbers.
    sources = {
        "cdc": ("https://www.cdc.gov/media/archives.html?Sort=Article%20Date%3A%3Adesc&Page={}", page_in_cdc),
        "who": ("https://www.who.int/news-room/releases/{}", page_in_who),
        "nih": ("https://search.nih.gov/search/docs?affiliate=nih&dc=565&page={}&query=covid-19&utf8=%E2%9C%93", page_in_nih),
        "webmd": ("https://www.webmd.com/search/search_results/default.aspx?query=covid19&page={}", page_in_webMD),
        "smithsonianmag": ("https://www.smithsonianmag.com/search/?q=covid-19&page={}", page_in_smithsonianmag),
        "science_daily": ("https://www.sciencedaily.com/search/?keyword=covid19#gsc.tab=0&gsc.q=covid%2019%20site%3Awww.sciencedaily.com&gsc.sort=&gsc.page={}", page_in_science_daily),
        "healthline": ("https://www.healthline.com/health-news?ref=global", page_in_healthline),
        "ecdc": ("./crawled_data/ecdc.html", page_in_ecdc),
        "mnt": ("https://www.medicalnewstoday.com/coronavirus", page_in_MNT),
        "mayo_clinic": ("https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963", page_in_mayo_clinic),
        "celeveland": ("https://newsroom.clevelandclinic.org/category/news-releases/page/{}/", page_in_cleveland_clink),
        "snopes": ("https://www.snopes.com/news/page/{}", page_in_snopes),
        "politico": ("https://www.politico.com/search/{}?q=covid19", page_in_politico),
        "dn": ("{}", page_in_dn),
        "publico": ("{}", page_in_publico),
        "afp": ("https://www.afp.com/fr/search/results/covid-19?page={}&f[0]=im_field_tags%3A74", page_in_afp),
        "elpais": ("https://elpais.com/noticias/covid-19/{}/", page_in_elpais),
        "abces": ("https://www.abc.es/hemeroteca/resultados-busqueda-avanzada/todo/pagina-{}?tod=covid&nin=19", page_in_abces),
        "animalpolitico": ("{}", page_in_animalpolitico),
        "lemonde": ("https://www.lemonde.fr/recherche/?search_keywords=covid-19&start_at=03/01/2020&end_at=26/07/2020&search_sort=relevance_desc&page={}", page_in_lemonde),
        "jn": ("{}", page_in_jn),
        "milenio": ("https://www.efe.com/efe/espana/busqueda/50000538?q=covid-19&p={}&s=0", page_in_milenio),
    }
    if name not in sources:
        raise NotImplementedError(name)
    main_url, crawl_func = sources[name]

    # TODO: Automatically extract the page number
    if page_count > 0:
        all_pages = list(range(1, page_count + 1))
    else:
        all_pages = [-1]  # Sentinel used when no page count is given.
    num_process = max(1, os.cpu_count() - 3)  # Leave a few cores free.
    all_pages_chunkify = chunkify(all_pages, num_process)
    multiprocess_function(num_process,
                          function_ref=fetch_save_collection,
                          args=(all_pages_chunkify, main_url, db, crawl_func))
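
A hypothetical invocation, assuming a local pymongo connection; the connection string and database name are placeholders, not the project's real config:

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["covid_news"]

# Crawl the first 5 listing pages of the CDC newsroom archive.
ReliableCrawler("cdc", page_count=5, db=db)

# Sources with a "{}" template (e.g. "dn") presumably take page_count <= 0,
# letting the crawl function drive its own pagination.
ReliableCrawler("dn", page_count=-1, db=db)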
Example #6
def multiprocess_fetch_save_tweets(tweet_search_entries, twarc_connector, db):
    """Fetch and save tweets for the given search entries across worker processes."""
    num_process = max(1, os.cpu_count() - 3)  # Leave a few cores free.
    tweet_search_entry_chunks = chunkify(tweet_search_entries, num_process)

    multiprocess_function(num_process, fetch_save_tweets_job,
                          (tweet_search_entry_chunks, twarc_connector, db))