def run(total_hours=24, hourly_limit=300, shuffle=False, termlist=None):
    if termlist is None:
        termlist = space.load_termlist()
    total_requests = min(int(total_hours * hourly_limit), len(termlist))
    total_time = 60 * 60 * min(total_hours, len(termlist) / hourly_limit)
    wait_time = total_time / total_requests
    daily_max_requests = hourly_limit * 24

    try:
        import os
        os.mkdir('search_results')
    except Exception as e:
        # print("could not make directory", e)
        pass

    # not sure if shuffle is needed; if so, shuffle the index
    term_indices = list(termlist.index)
    if shuffle:
        # raise NotImplementedError()
        print("shuffling termlist")
        random.shuffle(term_indices)

    if len(termlist) > daily_max_requests:
        print("Warning: termlist length is", len(termlist),
              "while max daily requests will be", daily_max_requests)
    if len(termlist) > total_requests:
        print(
            f"Warning: only querying {total_requests} of {len(termlist)} total terms (not enough time specified)"
        )
    space.write_logs(
        f"querying {total_requests} terms for a minimum of {printable_time(seconds=total_time)}",
        verbose=True)

    google_img_count = 0
    baidu_img_count = 0
    google_fails = []
    baidu_fails = []
    results = ResultSetList()
    start_ts = time.time()

    for i in range(0, total_requests):
        start_iter_ts = time.time()
        try:
            term_idx = term_indices.pop()
            english_term = termlist.loc[term_idx].english
            chinese_term = termlist.loc[term_idx].chinese
        except Exception as e:
            print("out of terms", term_idx, str(e))
            break
        try:
            label = termlist.loc[term_idx].label
        except Exception as e:
            label = "automated_scraper"

        result = ResultSet(english_term, chinese_term, label)
        print(
            f'request {i}, term idx {term_idx}: "{result.combined_term()}", (label: {label})'
        )

        if not english_term:
            print("\tskipping Google for term (English term not present)")
        else:
            try:
                urls = query_google(english_term)
                # print(f"\tGoogle got {len(urls)} images")
                result.add(urls[:MAX_PICTURES_PER], GOOGLE)
            except Exception as e:
                google_fails.append(e)
                print("\tGoogle fail")

        if not chinese_term:
            print("\tskipping Baidu for term (Chinese term not present)")
        else:
            try:
                urls = query_baidu(chinese_term)
                # print(f"\tBaidu got {len(urls)} images")
                result.add(urls[:MAX_PICTURES_PER], BAIDU)
            except Exception as e:
                baidu_fails.append(e)
                print("\tBaidu fail")

        # print("done querying search engines for term", english_term)
        results.add(result)

        # account for the time the calls took
        took = time.time() - start_iter_ts
        # add in random jitter (between -1 and +1 seconds)
        time_noise = random.random() * 2 - 1
        # print("adding noise to wait time", printable_time(seconds=time_noise))

        # cache results; this is a backup and not meant to be a reliable data store
        if i % 25 == 24:
            try:
                update_results(results)
                results.clear()
                google_img_count += results.wrote[GOOGLE]
                baidu_img_count += results.wrote[BAIDU]
            except Exception as e:
                import traceback
                print(
                    "failed to write search results; waiting until next attempt:",
                    e)
                exc = traceback.format_exc()
                print(str(exc))

        time.sleep(max(0, wait_time - took + time_noise))

    if results.length > 0:
        try:
            update_results(results)
            results.clear()
            google_img_count += results.wrote[GOOGLE]
            baidu_img_count += results.wrote[BAIDU]
        except Exception as e:
            import traceback
            exc = traceback.format_exc()
            print(exc)
            print("Failed to update search results, waiting 1 minute")
            time.sleep(60)
            update_results(results)
            results.clear()
            google_img_count += results.wrote[GOOGLE]
            baidu_img_count += results.wrote[BAIDU]

    space.write_logs(
        f'wrote {results.wrote[GOOGLE]} google images and {results.wrote[BAIDU]} baidu images',
        verbose=True)
    if len(baidu_fails) > 0 or len(google_fails) > 0:
        space.write_error(
            f"Baidu failures: {len(baidu_fails)}, Google failures: {len(google_fails)}"
        )
    print("took", printable_time(seconds=time.time() - start_ts))
    return (google_img_count, baidu_img_count, total_requests)
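# --- Illustrative sketch only: the real ResultSet / ResultSetList live in results.py. ---
# run() above assumes roughly the interface below: a per-term ResultSet holding URL
# lists keyed by engine, and a ResultSetList that aggregates them, exposes `length`,
# tracks images written per engine in `wrote`, and can be cleared after each flush.
# The attribute names and engine-key values here are assumptions for illustration.

GOOGLE = "google"  # assumed values of the engine-key constants used throughout
BAIDU = "baidu"


class ResultSetSketch:
    """Minimal stand-in documenting the per-term result interface run() relies on."""

    def __init__(self, english_term="", chinese_term="", label=""):
        self.english_term = english_term
        self.chinese_term = chinese_term
        self.label = label
        self.urls = {GOOGLE: [], BAIDU: []}

    def add(self, urls, engine):
        # run() passes at most MAX_PICTURES_PER URLs per engine
        self.urls[engine].extend(urls)

    def combined_term(self):
        return " / ".join(t for t in (self.english_term, self.chinese_term) if t)


class ResultSetListSketch:
    """Minimal stand-in documenting the aggregate interface run() relies on."""

    def __init__(self):
        self._results = []
        self.wrote = {GOOGLE: 0, BAIDU: 0}  # update_results() is assumed to fill this

    @property
    def length(self):
        return len(self._results)

    def add(self, result):
        self._results.append(result)

    def clear(self):
        self._results = []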
def save_search_results(results):
    '''
    Given a set of searches that the scraper has created, post each
    individually to the /createSearch endpoint.
    spaces_interface.write_search_results should have created a new list in
    each result object of the Digital Ocean URLs.
    '''
    search_term_to_id = {}
    print(f"saving {results.length} search terms")
    for term, result in results.iterterm():
        post_result = post_search(result, '192.168.0.1')
        if not post_result:
            raise Exception("failed to post result for term " + term)
        if len(result.urls[GOOGLE]) != len(result.get_datalake_urls(GOOGLE)):
            post_images(post_result["search_id"], GOOGLE,
                        result.get_datalake_urls(GOOGLE))
        else:
            post_images(post_result["search_id"], GOOGLE,
                        result.get_datalake_urls(GOOGLE), result.urls[GOOGLE])
        if len(result.urls[BAIDU]) != len(result.get_datalake_urls(BAIDU)):
            post_images(post_result["search_id"], BAIDU,
                        result.get_datalake_urls(BAIDU))
        else:
            post_images(post_result["search_id"], BAIDU,
                        result.get_datalake_urls(BAIDU), result.urls[BAIDU])
        search_term_to_id[result.combined_term()] = post_result["search_id"]
    return search_term_to_id


if __name__ == "__main__":
    from results import ResultSet, ResultSetList
    import time

    result = ResultSet('bunny', '')
    result.add(['google.com', 'bunnies.io'], GOOGLE)
    result.set_datalake_urls(['datalake.com/google.com', 'datalake.com/bunnies.io'],
                             GOOGLE)
    results = ResultSetList()
    results.add(result)
    save_search_results(results)
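# --- Illustrative sketch only: assumed shape of post_search / post_images. ---
# save_search_results() above expects post_search() to POST one search to the
# /createSearch endpoint (mentioned in the docstring) and return a dict containing
# "search_id", and post_images() to attach the datalake (and optionally original)
# URLs to that search. The payload fields, the /createImages route, and the
# attribute lookups on `result` below are assumptions, not the real API.

import requests


def post_search_sketch(result, host):
    payload = {
        "term": result.combined_term(),          # combined_term() is used above
        "label": getattr(result, "label", ""),   # attribute name assumed
    }
    resp = requests.post(f"http://{host}/createSearch", json=payload, timeout=10)
    if resp.status_code != 200:
        return None
    return resp.json()  # expected to contain "search_id"


def post_images_sketch(search_id, engine, datalake_urls, original_urls=None,
                       host="192.168.0.1"):
    payload = {
        "search_id": search_id,
        "engine": engine,
        "datalake_urls": datalake_urls,
        "original_urls": original_urls,
    }
    # /createImages is a hypothetical companion endpoint for illustration
    requests.post(f"http://{host}/createImages", json=payload, timeout=10)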
import time
from string import Template

import requests

# ResultSet, Input, and output_parsers are assumed to be provided elsewhere in
# this module/package.


class Connector(object):

    def __init__(self):
        self.results = ResultSet()  # list of dictionaries containing results

    def set_auth_type(self, auth_type):
        """Set the authorization type for the given web service.

        Allowed types:
            anon:  web service only provides anonymous access
            login: user must be logged in to use the web service
            mixed: web service allows both anonymous and user-based access
        """
        pass

    def set_throttle(self, limit=None, units=None):
        """Set the request rate for the given service.

        units: {'requests per second', 'requests per day', etc.}
        """
        self.delay = 0
        self.max_requests = 1e16
        self.made_requests = 0

    def throttle(f):
        """Wrapper function for throttling web service requests."""
        def wrapper(self, *args, **kwargs):
            if self.made_requests < self.max_requests:
                time.sleep(self.delay)
                f(self, *args, **kwargs)
                self.made_requests += 1
            else:
                raise Exception('maximum request limit reached')
        return wrapper

    def set_url(self, url_str):
        self.url = Template(url_str)

    def set_parser(self, output_format):
        """Set the object variable to the correct output stream parser."""
        self.output_parser = output_parsers.get(output_format, lambda x: x)

    def get_url_params(self):
        p = {}
        for attr_name, attr_value in self.__class__.__dict__.items():
            if isinstance(attr_value, Input):
                p[attr_name] = getattr(attr_value, 'value')
        return p

    @throttle
    def fetch(self, config):
        for key, val in config.items():
            setattr(getattr(self, key), 'value', val)
        url = self.url.substitute(self.get_url_params())
        r = requests.get(url)
        results = self.output_parser(r.text)
        self.parse_results(results)

    def fetchmany(self, config_list):
        for config in config_list:
            self.fetch(config)

    def parse_results(self, results):
        raise NotImplementedError

    def reset(self):
        self.results.clear_results()
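# --- Illustrative usage sketch only. ---
# Connector is meant to be subclassed: class-level Input attributes become the
# parameters substituted into the string.Template URL, set_parser() picks an
# output parser, and parse_results() turns the parsed response into stored
# results. The Input class is not shown in this file; fetch() and
# get_url_params() only imply that it carries a mutable `.value` attribute, so
# the stand-in below (and the example endpoint and fields) are assumptions; a
# real subclass would import the module's own Input.

class Input(object):  # stand-in with the one attribute the Connector relies on
    def __init__(self, value=None):
        self.value = value


class ExampleWeatherConnector(Connector):
    # class-level Input attributes are picked up by get_url_params()
    city = Input()
    units = Input("metric")

    def __init__(self):
        Connector.__init__(self)
        self.set_url("http://api.example.com/weather?city=$city&units=$units")
        self.set_throttle()      # defaults: no delay, effectively unlimited requests
        self.set_parser("json")  # falls back to the identity parser if unknown

    def parse_results(self, results):
        # a real subclass would normalize fields into self.results here
        print(results)


# Example usage:
# conn = ExampleWeatherConnector()
# conn.fetch({"city": "Boston"})  # sets city.value, builds the URL, GETs, parses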