def scrape(cores, db_name):
    """
    Scrape every not-yet-scraped restaurant URL gathered from the Yelp API.

    INPUT:
        cores (int) - Number of cores to use for MP (passed to Pool).
        db_name - Database name forwarded to YelpDatabase(database_name=...)
            and to mongo_connect(db_name, "rest_scrape").
    OUTPUT: None.

    Wrapper function to begin scraping list of URLs gathered from Yelp API
    which have not yet been scraped. Only rows with review_count >= 20 are
    kept, ordered by review_count descending and de-duplicated by URL.
    Calls scrape_func on every remaining URL. When ip_test(PROXY_IP) is
    falsy, nothing is scraped and a reminder is printed instead.
    """
    # Guard clause: refuse to scrape unless the proxy check passes.
    # The parenthesised single-argument print prints the same text as the
    # Python 2 print statement and is also valid Python 3.
    if not ip_test(PROXY_IP):
        print("Connect to proxy!")
        return

    nyc_rest = YelpDatabase(database_name=db_name, cat_filt="restaurants")
    df_yelp = nyc_rest.get_full_df()
    df_yelp_lim = df_yelp[df_yelp.review_count >= 20]

    # Sort before de-duplicating so that the first (kept) row for each URL
    # is the one with the highest review_count.
    # NOTE(review): DataFrame.sort was deprecated in pandas 0.17 and removed
    # in 0.20; switch to .sort_values(...) when the pandas pin is raised.
    all_sorted_urls = (df_yelp_lim[['review_count', 'url']]
                       .sort('review_count', ascending=False)
                       .drop_duplicates('url')
                       .url.values)

    # URLs already stored by a previous run; a set keeps each membership
    # test O(1).
    yelp_scraped = set(mongo_connect(db_name, "rest_scrape").distinct('url'))
    urls = [x for x in all_sorted_urls if x not in yelp_scraped]
    print("Remaining: {0}".format(len(urls)))

    # Nothing left to do: avoid spawning worker processes for an empty job.
    if not urls:
        return

    p = Pool(cores)
    try:
        p.map(scrape_func, urls)
    finally:
        # Always release the workers, even if a scrape task raised;
        # otherwise the child processes linger after this call.
        p.close()
        p.join()
def __init__(self, database_name, cat_filt):
    """
    INPUT: database_name, cat_filt - forwarded unchanged to
           YelpDatabase.__init__.
    OUTPUT: None.

    Initialise the instance by delegating directly to
    YelpDatabase.__init__.
    NOTE(review): the enclosing class header is not visible here;
    presumably this class derives from YelpDatabase - confirm.
    """
    YelpDatabase.__init__(
        self,
        database_name=database_name,
        cat_filt=cat_filt,
    )