        self.access_key_id = urlparse.unquote(parsed_url.username)
        self.secret_access_key = urlparse.unquote(parsed_url.password)
        self.conn = boto.connect_s3(self.access_key_id, self.secret_access_key)
        self.bucket_name = parsed_url.hostname
        self.bucket = self.conn.get_bucket(self.bucket_name, validate=False)

    def earliest_record_time(self):
        # Grab and parse the first key under the prefix
        for key in self.bucket.get_all_keys(prefix=self.prefix + "/",
                                            delimiter="/", max_keys=1):
            parts = filter(None, key.name.split("/")[1:])
            params = dict([entry.split("=", 1) for entry in parts])
            date = params["dt"]
            return parser.parse(date)

        # if the bucket is empty, return now
        return datetime.utcnow()

    def segment_between(self, start, end):
        # TODO: this won't return a range of keys, only the ones that start
        # exactly with the start time
        keys = self.bucket.list("{0}/dt={1}/".format(self.prefix, start.isoformat()),
                                delimiter="/")
        seconds_good_for = 60 * 60 * 24  # signed urls stay valid for one day
        urls = [k.generate_url(seconds_good_for, force_http=True)
                for k in keys if k.size > 0]
        return urls


datasources.set_source_for_scheme(S3Source, "s3")
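The dt= naming convention is what ties earliest_record_time and segment_between together. A minimal sketch of that convention follows; the 'events' prefix and key name are hypothetical, not taken from the project, but the string handling mirrors the methods above.

# Illustrative only: prefix and key name are hypothetical.
from datetime import datetime

prefix = 'events'
start = datetime(2013, 5, 1)

# The listing prefix segment_between builds for this start time:
assert "{0}/dt={1}/".format(prefix, start.isoformat()) == \
    "events/dt=2013-05-01T00:00:00/"

# How earliest_record_time parses the timestamp back out of a key prefix
# returned by a delimiter='/' listing:
key_name = "events/dt=2013-05-01T00:00:00/"
parts = filter(None, key_name.split("/")[1:])
params = dict([entry.split("=", 1) for entry in parts])
assert params["dt"] == "2013-05-01T00:00:00"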
import disco.func

from triv.io import datasources


class FeverSource(datasources.DataSource):
    scheme = "fever"

    @staticmethod
    def input_stream(stream, size, url, params):
        return disco.func.map_input_stream(stream, size, url, params)

    @property
    def table_url(self):
        """Return base_url and table as base_url:table if base_url has
        been set, otherwise just table."""
        return ':'.join(filter(None, (self.rule.base_url, self.table)))

    def earliest_record_time(self):
        sources_rule = self.rule.rule_set.find_rule_by_target(
            self.rule.source.table)
        sources_source = sources_rule.source
        # TODO: Consider caching earliest_record_time so that dependent rules
        # don't cause more than one network query
        return sources_source.earliest_record_time()

    def segment_between(self, start, end):
        return None


datasources.set_source_for_scheme(FeverSource, '')
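table_url relies on filter(None, ...) to drop an unset base_url, so the join degrades gracefully when only a table name is available. A small sketch with hypothetical values:

# Illustrative only: the base_url and table values are hypothetical.
table = 'clicks'

# With a base_url set, the two parts are joined with ':' ...
assert ':'.join(filter(None, ('disco://master', table))) == 'disco://master:clicks'

# ... and with base_url unset (None), only the table remains.
assert ':'.join(filter(None, (None, table))) == 'clicks'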
from datetime import datetime

from disco.schemes.scheme_http import input_stream as http_input_stream

from triv.io import datasources


class HTTPSource(datasources.DataSource):
    """Poll an HTTP source."""

    @staticmethod
    def input_stream(stream, size, url, params):
        stream, size, url = http_input_stream(stream, size, url, params)
        params.headers = stream.headers
        if not hasattr(params, 'content_type'):
            params.content_type = stream.headers['content-type']
        return stream

    def earliest_record_time(self):
        return datetime.utcnow()

    def segment_between(self, start, end):
        # TODO: do an HTTP HEAD request and use that for the datetime
        return [self.parsed_url.geturl() + '#' + start.isoformat()]


datasources.set_source_for_scheme(HTTPSource, 'http')
datasources.set_source_for_scheme(HTTPSource, 'https')
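Because HTTP has no native notion of a time segment, segment_between simply tags the polled URL with the segment's start time as a fragment. A sketch of the resulting URL, using a hypothetical feed address:

# Illustrative only: the feed URL is hypothetical.
from datetime import datetime
from urlparse import urlparse  # Python 2, as used elsewhere in the codebase

parsed_url = urlparse('http://example.com/feed.xml')
start = datetime(2013, 5, 1)

assert parsed_url.geturl() + '#' + start.isoformat() == \
    'http://example.com/feed.xml#2013-05-01T00:00:00'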
        return datetime.utcnow()

    def segment_between(self, start, end):
        '''Return a segment whose urls are all files found within the job_path.

        Note that the scheme repo:// is retained and all files are relative
        to it. The input_stream, used in the worker, locates these files
        relative to the worker's job directory.
        '''
        prefix_len = len(self.rule.job_path)
        path = self.rule.path(self.parsed_url.path)

        urls = []
        for root, dirs, files in os.walk(path):
            # prune hidden directories in place so os.walk skips them
            dirs[:] = [d for d in dirs if not d.startswith('.')]

            dir = root[prefix_len:]
            for file in files:
                url = 'repo:/' + os.path.join(dir, file)
                urls.append(url)
        return urls


datasources.set_source_for_scheme(RepoSource, 'repo')
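The 'repo:/' + os.path.join(...) construction works because the directory component keeps its leading slash once the job_path prefix is stripped, yielding a repo:// url. A sketch with hypothetical paths:

# Illustrative only: the paths are hypothetical.
import os

job_path = '/srv/disco/jobs/job-1234'      # stands in for self.rule.job_path
root = job_path + '/lib/templates'         # a directory os.walk would visit
name = 'report.html'                       # a file found in that directory

dir = root[len(job_path):]                 # '/lib/templates', leading slash kept
assert 'repo:/' + os.path.join(dir, name) == 'repo://lib/templates/report.html'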
        return self.dtstart

    def sample(self, start=None):
        """Return an input_stream and url.

        Since the output of a mock is meant mostly for testing components,
        we return the same values for the "sample".
        """
        scheme, netloc, path, params, query, fragment = self.parsed_url
        if start is not None:
            fragment = start.isoformat()
        else:
            fragment = ''

        return self.input_stream, [
            urlunparse((scheme, netloc, path, params, query, fragment))
        ]

    def segment_between(self, start, end):
        '''Return a list of urls that belong in the given time range.

        Note that all information needed to access a url must be encoded
        into the url.
        '''
        reader, urls = self.sample(start)
        return urls


datasources.set_source_for_scheme(MockSource, 'mock')
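As with the HTTP source, the sample's start time travels in the URL fragment. A sketch of the round trip through urlparse/urlunparse, using a hypothetical mock:// url:

# Illustrative only: the mock:// url is hypothetical.
from datetime import datetime
from urlparse import urlparse, urlunparse  # Python 2, as used above

scheme, netloc, path, params, query, fragment = urlparse('mock://source/records')
fragment = datetime(2013, 5, 1).isoformat()

assert urlunparse((scheme, netloc, path, params, query, fragment)) == \
    'mock://source/records#2013-05-01T00:00:00'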