def generate_job_postings_from_s3(
    s3_conn,
    s3_prefix: Text,
) -> JobPostingGeneratorType:
    """Stream every job listing stored under an s3 prefix.

    Args:
        s3_conn: a boto s3 connection
        s3_prefix: path to the job listings

    Yields:
        the next job listing, parsed from its JSON representation.
        Refer to sample_job_listing.json for example structure
    """
    # Retry transient IO failures with exponential backoff.
    download_with_retries = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000,
    )
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    for key in bucket.list(prefix=prefix):
        logging.info('Extracting job postings from key {}'.format(key.name))
        # Buffer each key fully in memory, then stream it line by line.
        with BytesIO() as buffer:
            download_with_retries.call(
                key.get_contents_to_file, buffer, cb=log_download_progress
            )
            buffer.seek(0)
            for raw_line in buffer:
                yield json.loads(raw_line.decode('utf-8'))
def job_postings(s3_conn, quarter, s3_path, source="all"):
    """Stream all job listings from s3 for a given quarter.

    Args:
        s3_conn: a boto s3 connection
        quarter: a string representing a quarter (2015Q1)
        s3_path: path to the job listings.
        source: either a single source string ("nlx", "va", "cb" or "all"),
            or a list of source strings ("all" excluded)

    Yields:
        string in json format representing the next job listing
        Refer to sample_job_listing.json for example structure

    Raises:
        ValueError: if source is neither a string nor a list
    """
    retrier = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000
    )
    bucket_name, prefix = split_s3_path(s3_path)
    bucket = s3_conn.get_bucket(bucket_name)
    if isinstance(source, str):
        if source.lower() == "all":
            keys = bucket.list(prefix='{}/{}'.format(prefix, quarter))
        else:
            keys = bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, source.upper()))
    elif isinstance(source, list):
        # Lazily chain one bucket listing per requested source.
        keys = chain.from_iterable(
            bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, s.upper()))
            for s in source
        )
    else:
        # Previously any other type fell through to an unbound 'keys'
        # (NameError at the loop below); fail with a clear message instead.
        raise ValueError(
            'source must be a string or a list of strings, got {!r}'.format(source)
        )
    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as outfile:
            retrier.call(key.get_contents_to_file, outfile, cb=log_download_progress)
            outfile.seek(0)
            for line in outfile:
                yield line.decode('utf-8')
def api_call(self, path, headers=None, retries=10, **kwargs):
    """Issue a GET against the API and return the parsed JSON response.

    Args:
        path: path fragment interpolated into self.API_URL
        headers: NOTE(review): accepted but never used — confirm whether
            callers expect these to be sent
        retries: maximum number of attempts before giving up
        **kwargs: passed through to urllib2.Request; a 'timeout' key is
            popped and used as the socket timeout (default 10.0 seconds)

    Returns:
        the decoded JSON body of the response
    """
    timeout = kwargs.pop("timeout", 10.0)
    opener = urllib2.build_opener()
    opener.addheaders = [("User-Agent", self.user_agent),
                         ("region-id", str(self.region))]
    req = urllib2.Request(self.API_URL.format(path), **kwargs)
    # Make the request, with retries on HTTP errors (exponential backoff).
    retrier = Retrying(stop_max_attempt_number=retries,
                       wait_exponential_multiplier=500,
                       wait_exponential_max=5000,
                       retry_on_exception=retry_if_http_error,
                       wrap_exception=True)
    res = retrier.call(opener.open, req, timeout=timeout)
    try:
        return json.loads(res.read())
    finally:
        # The response was previously never closed, leaking the socket.
        res.close()
def _get(self, url, package):
    """GET ``url % package``, retrying with exponential backoff.

    Args:
        url: a %-style format string with one placeholder for the package
        package: the value interpolated into url

    Returns:
        the requests.Response from the (eventually) successful call
    """
    backoff = Retrying(
        wait_exponential_multiplier=2000,
        wait_exponential_max=120000,
        retry_on_exception=_retry_msg,
    )
    return backoff.call(requests.get, url % package)