Example #1
0
from triv.io import datasources


class FeverSource(datasources.DataSource):
    """triv.io data source for the ``fever`` scheme."""

    scheme = "fever"

    @staticmethod
    def input_stream(stream, size, url, params):
        # Hand off to disco's default map input stream.
        return disco.func.map_input_stream(stream, size, url, params)

    @property
    def table_url(self):
        """Return base_url:table when base_url is set, otherwise just table."""
        pieces = (self.rule.base_url, self.table)
        return ':'.join(p for p in pieces if p)

    def earliest_record_time(self):
        # Defer to the source that feeds the rule producing our source table.
        # TODO: Consider caching earliest_record_time so that dependent rules
        # don't cause more than one network query
        dependency = self.rule.rule_set.find_rule_by_target(self.rule.source.table)
        return dependency.source.earliest_record_time()

    def segment_between(self, start, end):
        # Fever sources yield no discrete segments.
        return None


# Register FeverSource as the handler for URLs that carry no scheme.
datasources.set_source_for_scheme(FeverSource,'')
Example #2
0
        self.acccess_key_id = urlparse.unquote(parsed_url.username)
        self.secret_access_key = urlparse.unquote(parsed_url.password)

        self.conn = boto.connect_s3(self.acccess_key_id, self.secret_access_key)
        self.bucket_name = parsed_url.hostname
        self.bucket = self.conn.get_bucket(self.bucket_name, validate=False)

    def earliest_record_time(self):
        """Timestamp parsed from the bucket's first key (utcnow if empty).

        NOTE(review): assumes key names look like <prefix>/dt=<iso-date>/...
        — confirm against the writer of this bucket.
        """
        head = self.bucket.get_all_keys(prefix=self.prefix + "/",
                                        delimiter="/", max_keys=1)
        for key in head:
            segments = [seg for seg in key.name.split("/")[1:] if seg]
            fields = dict(seg.split("=", 1) for seg in segments)
            return parser.parse(fields["dt"])
        # Nothing in the bucket: report "now" as the earliest record time.
        return datetime.utcnow()

    def segment_between(self, start, end):
        """Return URLs for the non-empty keys stamped exactly at ``start``.

        NOTE: only the dt= folder matching the start time is listed, not the
        whole [start, end) range.
        """
        one_day = 60 * 60 * 24
        stamped_prefix = "{0}/dt={1}/".format(self.prefix, start.isoformat())
        candidates = self.bucket.list(stamped_prefix, delimiter="/")
        return [key.generate_url(one_day, force_http=True)
                for key in candidates if key.size > 0]


# Register S3Source (defined above) as the handler for s3:// URLs.
datasources.set_source_for_scheme(S3Source, "s3")
Example #3
0
                key.bucket.name, key.key)

    def earliest_record_time(self):
        """Parse the timestamp encoded in the bucket's first key.

        Falls back to the current UTC time when the bucket is empty.
        NOTE(review): assumes key names look like <prefix>/dt=<iso-date>/...
        """
        listing = self.bucket.get_all_keys(prefix=self.prefix + '/',
                                           delimiter='/',
                                           max_keys=1)
        for key in listing:
            components = key.name.split('/')[1:]
            pairs = dict(c.split('=', 1) for c in components if c)
            return parser.parse(pairs['dt'])
        return datetime.utcnow()

    def segment_between(self, start, end):
        """Return URLs (via self.generate_url) for non-empty keys stamped
        exactly at ``start``.

        TODO: this won't return a range of keys, only the folder that starts
        exactly with the start time.
        """
        keys = self.bucket.list("{0}/dt={1}/".format(self.prefix,
                                                     start.isoformat()),
                                delimiter='/')
        # The 24h expiry previously computed here was dead code: it stopped
        # being used once URL generation moved into self.generate_url.
        return [
            self.generate_url(k, force_http=True) for k in keys if k.size > 0
        ]


# Register S3Source (defined above) as the handler for s3:// URLs.
datasources.set_source_for_scheme(S3Source, 's3')
Example #4
0
import disco.func

from triv.io import datasources


class FeverSource(datasources.DataSource):
    """Data source handling the ``fever`` scheme."""

    scheme = "fever"

    @staticmethod
    def input_stream(stream, size, url, params):
        """Delegate to disco's standard map input stream."""
        return disco.func.map_input_stream(stream, size, url, params)

    @property
    def table_url(self):
        """Return 'base_url:table' when base_url is set, otherwise just the table."""
        return ':'.join(part for part in (self.rule.base_url, self.table) if part)

    def earliest_record_time(self):
        """Earliest record time of the source feeding our source table."""
        upstream = self.rule.rule_set.find_rule_by_target(self.rule.source.table)
        # TODO: Consider caching earliest_record_time so that dependent rules
        # don't cause more than one network query
        return upstream.source.earliest_record_time()

    def segment_between(self, start, end):
        """Fever sources produce no segments."""
        return None


# Register FeverSource as the handler for URLs that carry no scheme.
datasources.set_source_for_scheme(FeverSource, '')
Example #5
0
from datetime import datetime, timedelta
from disco.schemes.scheme_http import input_stream as http_input_stream

from triv.io import datasources


class HTTPSource(datasources.DataSource):
    """Poll an http source."""

    @staticmethod
    def input_stream(stream, size, url, params):
        """Wrap disco's http input stream, mirroring response headers onto params."""
        stream, size, url = http_input_stream(stream, size, url, params)
        params.headers = stream.headers
        # Keep a caller-supplied content_type; otherwise take the server's.
        if not hasattr(params, 'content_type'):
            params.content_type = stream.headers['content-type']
        return stream

    def earliest_record_time(self):
        # Plain http polling has no backlog: the earliest record is "now".
        return datetime.utcnow()

    def segment_between(self, start, end):
        # TODO: do an http head and use that for the datetime
        base = self.parsed_url.geturl()
        return [base + '#' + start.isoformat()]


# Register HTTPSource as the handler for both http:// and https:// URLs.
datasources.set_source_for_scheme(HTTPSource, 'http')
datasources.set_source_for_scheme(HTTPSource, 'https')
Example #6
0
        self.conn.server_name(self.conn.port),
        key.bucket.name, key.key
      )

      



  def earliest_record_time(self):
    """Timestamp parsed from the bucket's first key (utcnow if empty).

    NOTE(review): assumes key names look like <prefix>/dt=<iso-date>/...
    """
    head = self.bucket.get_all_keys(prefix=self.prefix + '/', delimiter='/',
                                    max_keys=1)
    for key in head:
      fields = dict(part.split('=', 1)
                    for part in key.name.split('/')[1:] if part)
      return parser.parse(fields['dt'])
    # Empty bucket: treat "now" as the earliest record time.
    return datetime.utcnow()

  def segment_between(self, start, end):
    """Return URLs (via self.generate_url) for non-empty keys stamped
    exactly at ``start``.

    TODO: this won't return a range of keys, only the folder that starts
    exactly with the start time.
    """
    keys = self.bucket.list("{0}/dt={1}/".format(self.prefix, start.isoformat()), delimiter='/')
    # The 24h expiry previously computed here was dead code: it stopped being
    # used once URL generation moved into self.generate_url.
    return [self.generate_url(k, force_http=True) for k in keys if k.size > 0]

    
# Register S3Source (defined above) as the handler for s3:// URLs.
datasources.set_source_for_scheme(S3Source,'s3')
Example #7
0
from datetime import datetime, timedelta
from disco.schemes.scheme_http import input_stream as http_input_stream

from triv.io import datasources

class HTTPSource(datasources.DataSource):
    """Poll an http source."""

    @staticmethod
    def input_stream(stream, size, url, params):
        """Delegate to disco's http reader and surface its headers on params."""
        stream, size, url = http_input_stream(stream, size, url, params)
        params.headers = stream.headers
        params.content_type = stream.headers['content-type']
        return stream

    def earliest_record_time(self):
        # No backlog for plain http polling: the earliest record is "now".
        return datetime.utcnow()

    def segment_between(self, start, end):
        # TODO: do an http head and use that for the datetime
        base = self.parsed_url.geturl()
        return [base + '#' + start.isoformat()]

# Register HTTPSource as the handler for both http:// and https:// URLs.
datasources.set_source_for_scheme(HTTPSource,'http')
datasources.set_source_for_scheme(HTTPSource,'https')
Example #8
0
    return datetime.utcnow()
    
    
  def segment_between(self, start, end):
    '''Returns a segment whose urls are all files found within the job_path.

    Note that the scheme repo:// is retained and all files are relative to this.

    The input_stream, used in the worker, locates these files relative to the
    worker's job directory.
    '''

    prefix_len = len(self.rule.job_path)
    path = self.rule.path(self.parsed_url.path)

    urls = []
    for root, dirs, files in os.walk(path):
      # Prune hidden directories in place so os.walk skips descending into
      # them. (The previous `del dirs[i]` while enumerating a copy used stale
      # indices and removed the wrong entries once more than one
      # dot-directory was present.)
      dirs[:] = [d for d in dirs if not d.startswith('.')]

      rel_dir = root[prefix_len:]
      for filename in files:
        urls.append('repo:/' + os.path.join(rel_dir, filename))
    return urls
  
# Register RepoSource (defined above) as the handler for repo:// URLs.
datasources.set_source_for_scheme(RepoSource,'repo')
Example #9
0
        return self.dtstart

    def sample(self, start=None):
        """Return (input_stream, [url]) for this mock source.

        Since the output of a mock is meant mostly for testing components,
        the "sample" is the same url the source would normally produce;
        ``start``, when given, becomes the url fragment.
        """
        scheme, netloc, path, params, query, _ = self.parsed_url
        fragment = start.isoformat() if start is not None else ''
        url = urlunparse((scheme, netloc, path, params, query, fragment))
        return self.input_stream, [url]

    def segment_between(self, start, end):
        '''Return a list of url's that belong in the given time range. Note all
    information needed to access a url must be encoded into the url'''
        _, urls = self.sample(start)
        return urls


# Register MockSource (defined above) as the handler for mock:// URLs.
datasources.set_source_for_scheme(MockSource, 'mock')
Example #10
0
    
    return self.dtstart
    
    
  def sample(self, start=None):
    """Return (input_stream, [url]) for this mock source.

    Mock output is meant mostly for testing components, so the "sample" is
    the source's normal url; ``start``, when given, becomes the fragment.
    """
    scheme, netloc, path, params, query, fragment = self.parsed_url
    fragment = '' if start is None else start.isoformat()
    return self.input_stream, [
        urlunparse((scheme, netloc, path, params, query, fragment))]

    
  def segment_between(self, start, end):
    '''Return a list of url's that belong in the given time range. Note all
    information needed to access a url must be encoded into the url'''
    unused_reader, urls = self.sample(start)
    return urls
    
  
# Register MockSource (defined above) as the handler for mock:// URLs.
datasources.set_source_for_scheme(MockSource,'mock')
Example #11
0
    return datetime.utcnow()
    
    
  def segment_between(self, start, end):
    '''Returns a segment whose urls are all files found within the job_path.

    Note that the scheme repo:// is retained and all files are relative to this.

    The input_stream, used in the worker, locates these files relative to the
    worker's job directory.
    '''

    prefix_len = len(self.rule.job_path)
    path = self.rule.path(self.parsed_url.path)

    urls = []
    for root, dirs, files in os.walk(path):
      # Prune hidden directories in place so os.walk skips descending into
      # them. (The previous `del dirs[i]` while enumerating a copy used stale
      # indices and removed the wrong entries once more than one
      # dot-directory was present.)
      dirs[:] = [d for d in dirs if not d.startswith('.')]

      rel_dir = root[prefix_len:]
      for filename in files:
        urls.append('repo:/' + os.path.join(rel_dir, filename))
    return urls
  
# Register RepoSource (defined above) as the handler for repo:// URLs.
datasources.set_source_for_scheme(RepoSource,'repo')