Example #1
    def test_parse_file_source(self):
        d = b''.join([self.hm, self.hg])
        add = b"a:17:32:%d\n%s" % (len(d), d)
        sub = b"s:18:32:%d\n%s" % (len(d), d)

        adds = [
            Chunk(chunk_type='a',
                  number=17,
                  hashes=set([self.hg, self.hm]),
                  hash_size=32)
        ]
        subs = [
            Chunk(chunk_type='s',
                  number=18,
                  hashes=set([self.hg, self.hm]),
                  hash_size=32)
        ]

        self.assertEqual(parse_file_source(io.BytesIO(add)),
                         ChunkList(add_chunks=adds))
        self.assertEqual(parse_file_source(io.BytesIO(sub)),
                         ChunkList(sub_chunks=subs))
        # Both adds and subs with a spurious newline in between
        both = b"%s\n%s" % (add, sub)
        self.assertEqual(parse_file_source(io.BytesIO(both)),
                         ChunkList(add_chunks=adds, sub_chunks=subs))
Example #2
    def __init__(self, source_url, refresh_interval):
        self.source_url = source_url
        self.url = urlparse(self.source_url)
        self.interval = int(refresh_interval)
        self.last_refresh = 0
        self.last_check = 0
        # Initialize with an empty data set so we can always continue to serve
        self.chunks = ChunkList()
        self.chunk_index = {'adds': set(), 'subs': set()}
        self.prefixes = None
        self.no_data = True
Example #3
class Source(object):
    """
    Base class for data sources
    """

    def __init__(self, source_url, refresh_interval):
        self.source_url = source_url
        self.url = urlparse(self.source_url)
        self.interval = int(refresh_interval)
        self.last_refresh = 0
        self.last_check = 0
        # Initialize with an empty data set so we can always continue to serve
        self.chunks = ChunkList()
        self.chunk_index = {'adds': set(), 'subs': set()}
        self.prefixes = None
        self.no_data = True

    def load(self):
        raise NotImplementedError

    def _populate_chunks(self, fp, parser_func, *args, **kwargs):
        try:
            self.chunks = parser_func(fp, *args, **kwargs)
            self.last_check = int(time.time())
            self.last_refresh = int(time.time())
            self.chunk_index = {'adds': set(self.chunks.adds.keys()),
                                'subs': set(self.chunks.subs.keys())}
        except ParseError as e:
            raise ParseError('Error parsing "%s": %s' % (self.url.path, e))

    def refresh(self):
        # Prevent constant refresh checks
        now = int(time.time())
        if now - self.interval >= self.last_check:
            self.last_check = now
            if self.needs_refresh():
                self.load()
        return False

    def needs_refresh(self):
        return False

    def fetch(self, adds, subs):
        self.refresh()

        chunks = {'adds': [], 'subs': []}
        for chunk_num in adds:
            chunks['adds'].append(self.chunks.adds[chunk_num])
        for chunk_num in subs:
            chunks['subs'].append(self.chunks.subs[chunk_num])
        return chunks

    def list_chunks(self):
        self.refresh()
        return (self.chunk_index['adds'], self.chunk_index['subs'])

    def find_prefix(self, prefix):
        return self.chunks.find_prefix(prefix)
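The Source base class above defines the refresh/fetch machinery but leaves load() abstract and needs_refresh() as a no-op. As a rough sketch of how a subclass plugs in (the body here is illustrative, not the project's actual FileSource; it assumes parse_file_source as defined in the later examples and a plain local file behind the file:// URL):

import os


class FileSource(Source):
    """Loads a chunk list from a local file (illustrative sketch only)."""

    def load(self):
        # self.url was built from source_url by Source.__init__ via urlparse()
        with open(self.url.path, 'rb') as fp:
            self._populate_chunks(fp, parse_file_source)
        self.no_data = False

    def needs_refresh(self):
        # Hypothetical staleness check: reload when the file's mtime is
        # newer than the last successful refresh
        return os.stat(self.url.path).st_mtime > self.last_refresh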
Example #4
    def test_load(self):
        with mock_s3():
            conn = boto.connect_s3()
            b = conn.create_bucket(self.bucket_name)
            k = Key(b)
            k.name = self.key_name
            k.set_contents_from_string(self.add + b'\n' + self.sub)

            f = S3FileSource("s3+file://{0}/{1}".format(self.bucket_name,
                                                        self.key_name),
                             0.5)
            f.load()
            self.assertEqual(f.chunks, ChunkList(add_chunks=simple_adds,
                                                 sub_chunks=simple_subs))
Example #5
    _d = b''.join([hm, hg])
    add = b"a:17:32:%d\n%s" % (len(_d), _d)
    sub = b"s:18:32:%d\n%s" % (len(_d), _d)

    def setUp(self):
        self.maxDiff = None
        super(ShavarTestCase, self).setUp()

    def get_configurator(self):
        config = super(ShavarTestCase, self).get_configurator()
        config.include("shavar")
        return config


def chunkit(n, typ, *urls):
    return Chunk(number=n,
                 chunk_type=typ,
                 hashes=[hashlib.sha256(u.encode()).digest() for u in urls])


DELTA_RESULT = ChunkList(
    add_chunks=[
        chunkit(1, 'a', 'https://www.mozilla.org/', 'https://www.google.com/'),
        chunkit(2, 'a', 'https://github.com/', 'http://www.python.org/'),
        chunkit(4, 'a', 'http://www.haskell.org/', 'https://www.mozilla.com/'),
        chunkit(5, 'a', 'http://www.erlang.org', 'http://golang.org/')
    ],
    sub_chunks=[
        chunkit(3, 's', 'https://github.com/'),
        chunkit(6, 's', 'http://golang.org')
    ])
Example #6
def parse_file_source(handle):
    """
    Parses a file in chunk list format
    """
    # We should almost certainly* find the end of the first line (the
    # chunk header) within the first 32 bytes of the file.  The header
    # is a colon-delimited string with the following fields:
    #
    #  - chunk type: 'a' or 's' == 1 byte
    #  - chunk number: assuming a max of 2**32, up to 10 digits
    #  - size of each hash prefix in bytes: 4 for shavar or 32 for
    #    digest256 == up to 2 digits
    #  - length of the raw data that follows, in octets: up to 10 digits
    #
    # These fields total 23 bytes; 3 colons and a trailing newline bring
    # the likely maximum header length to 27 bytes, with a minimum of
    # 8 bytes ("1:1:4:1\n").
    #
    # So a 32-byte read should be more than sufficient.
    #
    # * If 64-bit ints get involved, there are other issues to address

    parsed = ChunkList()
    while True:
        blob = handle.read(32)

        # Consume any unnecessary newlines in front of chunks
        blob = blob.lstrip(b'\n')

        if not blob:
            break

        if len(blob) < 8:
            raise ParseError("Incomplete chunk file? Could only read %d "
                             "bytes of header." % len(blob))

        eol = blob.find(b'\n')
        if eol < 8:
            raise ParseError('Impossibly short chunk header: "%s"' % blob)
        header = blob[:eol].decode()

        if header.count(':') != 3:
            raise ParseError('Incorrect number of fields in chunk header: '
                             '"%s"' % header)

        add_sub, chunk_num, hash_len, read_len = header.split(':', 3)

        if len(add_sub) != 1:
            raise ParseError('Chunk type is too long: "%s"' % header)
        if add_sub not in ('a', 's'):
            raise ParseError('Invalid chunk type: "%s"' % header)

        try:
            chunk_num = int(chunk_num)
            hash_len = int(hash_len)
            read_len = int(read_len)
        except ValueError:
            raise ParseError('Non-integer chunk values: "%s"' % header)

        if read_len % hash_len != 0:
            raise ParseError('Chunk data length not a multiple of prefix '
                             'size: "%s"' % header)

        blob = blob[eol + 1:]
        blob += handle.read(read_len - len(blob))
        if blob is None or len(blob) < read_len:
            raise ParseError('Chunk data truncated for chunk %d' % chunk_num)

        hashes = []
        pos = 0
        while pos < read_len:
            hashes.append(blob[pos:pos + hash_len])
            pos += hash_len

        parsed.insert_chunk(Chunk(chunk_type=add_sub, number=chunk_num,
                                  hashes=hashes))

    return parsed
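A quick round trip through the parser above, assuming Chunk and ChunkList from the surrounding module; the two URLs are arbitrary stand-ins used only to generate 32-byte digest256 hashes:

import hashlib
import io

h1 = hashlib.sha256(b'https://example.com/').digest()
h2 = hashlib.sha256(b'https://example.org/').digest()
payload = h1 + h2

# Header: add chunk number 17, 32-byte hashes, 64 bytes of payload
blob = b'a:17:32:%d\n%s' % (len(payload), payload)

chunks = parse_file_source(io.BytesIO(blob))
# chunks.adds[17].hashes now contains h1 and h2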
Example #7
    except ValueError as e:
        raise ParseError("Could not parse index file: %s" % e)

    if 'name' not in index:
        raise ParseError("Incorrectly formatted index: missing list name")

    if 'chunks' not in index:
        raise ParseError("Incorrectly formatted index: missing chunks")

    if 'basedir' in index:
        basedir = posixpath.join(os.path.dirname(handle.name),
                                 index['basedir'])
    else:
        basedir = os.path.dirname(handle.name)

    parsed = ChunkList()
    for key in list(index['chunks'].keys()):
        # A little massaging to make the data structure a little cleaner
        try:
            index['chunks'][int(key)] = index['chunks'][key]
            del index['chunks'][key]
        except (KeyError, ValueError):
            raise ParseError("Some weird behaviour with the list of chunks "
                             "in \"%s\"" % handle.name)

        chunk_file = posixpath.join(basedir, key)

        if not exists_cb(chunk_file):
            raise ParseError("Invalid chunk filename: \"%s\"" % chunk_file)

        with open_cb(chunk_file, 'rb') as f:
            chunk_list = parse_file_source(f)
Example #8
def parse_dir_source(handle, exists_cb=os.path.exists, open_cb=open):
    """
    Expects a file-like object containing a JSON-formatted index file with
    the following structure:

    {
        "name": "mozpub-tracking-digest256",
        "basedir": "mozpub-tracking-digest256",
        "chunks": {
            "1": {
                "path": "mozpub-tracking-digest256/1",
                "hashes(optional)": [ "", "" ],
                "prefixes(optional)": [ "", "" ]
            },
            "2": {
                "path": "mozpub-tracking-digest256/2",
                "hashes": [ "", "" ],
                "prefixes": [ "", "" ]
            }
        }
    }

    The basedir, hashes, and prefixes entries are optional.  The chunks to be
    served will be parsed with parse_file_source().  If hashes and prefixes are
    provided, they will be verified against the data provided in the given
    chunk file.
    """
    try:
        index = json.load(handle)
    except ValueError as e:
        raise ParseError("Could not parse index file: %s" % e)

    if 'name' not in index:
        raise ParseError("Incorrectly formatted index: missing list name")

    if 'chunks' not in index:
        raise ParseError("Incorrectly formatted index: missing chunks")

    if 'basedir' in index:
        basedir = posixpath.join(os.path.dirname(handle.name),
                                 index['basedir'])
    else:
        handle_name = handle.name
        if isinstance(handle_name, int):
            handle_name = str(handle_name)
        basedir = os.path.dirname(handle_name)

    parsed = ChunkList()
    int_key_chunks = {}
    for key in index['chunks'].keys():
        # A little massaging to make the data structure a little cleaner
        try:
            int_key_chunks[int(key)] = index['chunks'][key]
        except (KeyError, ValueError):
            raise ParseError("Some weird behaviour with the list of chunks "
                             "in \"%s\"" % handle.name)
        chunk_file = posixpath.join(basedir, str(key))

        if not exists_cb(chunk_file):
            raise ParseError("Invalid chunk filename: \"%s\"" % chunk_file)

        with open_cb(chunk_file, 'rb') as f:
            chunk_list = parse_file_source(f)

        # Only one chunk per file
        if len(chunk_list) > 1:
            raise ParseError("More than one chunk in chunk file \"%s\"" %
                             chunk_file)

        for chunk in itertools.chain(iter(chunk_list.adds.values()),
                                     iter(chunk_list.subs.values())):
            parsed.insert_chunk(chunk)
    index['chunks'] = int_key_chunks
    return parsed
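A minimal invocation sketch for parse_dir_source(), substituting in-memory callbacks for the filesystem; the index mirrors the docstring's layout and all names here are illustrative. Note that the function joins basedir with the chunk's key, not its "path" entry, when locating chunk files:

import io
import json

# One add chunk (number 1) holding a single 32-byte hash of zeros
CHUNK_BLOB = b'a:1:32:32\n' + b'\x00' * 32

INDEX = {
    "name": "mozpub-tracking-digest256",
    "basedir": "mozpub-tracking-digest256",
    "chunks": {"1": {"path": "mozpub-tracking-digest256/1"}},
}


class NamedBytesIO(io.BytesIO):
    # parse_dir_source reads handle.name to compute basedir
    name = "index.json"


def fake_exists(path):
    return path == "mozpub-tracking-digest256/1"


def fake_open(path, mode):
    return io.BytesIO(CHUNK_BLOB)


handle = NamedBytesIO(json.dumps(INDEX).encode())
parsed = parse_dir_source(handle, exists_cb=fake_exists, open_cb=fake_open)
# parsed.adds[1] now holds the single zero hash from CHUNK_BLOB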
Example #9
def parse_file_source(handle):
    """
    Parses a file in chunk list format
    """
    # We should almost certainly* find the end of the first line (the
    # chunk header) within the first 32 bytes of the file.  The header
    # is a colon-delimited string with the following fields:
    #
    #  - chunk type: 'a' or 's' == 1 byte
    #  - chunk number: assuming a max of 2**32, up to 10 digits
    #  - size of each hash prefix in bytes: 4 for shavar or 32 for
    #    digest256 == up to 2 digits
    #  - length of the raw data that follows, in octets: up to 10 digits
    #
    # These fields total 23 bytes; 3 colons and a trailing newline bring
    # the likely maximum header length to 27 bytes, with a minimum of
    # 8 bytes ("1:1:4:1\n").
    #
    # So a 32-byte read should be more than sufficient.
    #
    # * If 64-bit ints get involved, there are other issues to address

    parsed = ChunkList()
    while True:
        blob = handle.read(32)

        # Consume any unnecessary newlines in front of chunks
        blob = blob.lstrip(b'\n')

        if not blob:
            break

        if len(blob) < 8:
            raise ParseError("Incomplete chunk file? Could only read %d "
                             "bytes of header." % len(blob))

        eol = blob.find(b'\n')
        if eol < 8:
            raise ParseError('Impossibly short chunk header: "%s"' % blob)
        header = get_header(blob, eol)

        if header.count(':') != 3:
            raise ParseError('Incorrect number of fields in chunk header: '
                             '"%s"' % header)

        add_sub, chunk_num, hash_len, read_len = header.split(':', 3)

        if len(add_sub) != 1:
            raise ParseError('Chunk type is too long: "%s"' % header)
        if add_sub not in ('a', 's'):
            raise ParseError('Invalid chunk type: "%s"' % header)

        try:
            chunk_num = int(chunk_num)
            hash_len = int(hash_len)
            read_len = int(read_len)
        except ValueError:
            raise ParseError('Non-integer chunk values: "%s"' % header)

        if read_len % hash_len != 0:
            raise ParseError('Chunk data length not a multiple of prefix '
                             'size: "%s"' % header)

        blob = blob[eol + 1:]
        blob += handle.read(read_len - len(blob))
        if blob is None or len(blob) < read_len:
            raise ParseError('Chunk data truncated for chunk %d' % chunk_num)

        hashes = []
        pos = 0
        while pos < read_len:
            hashes.append(blob[pos:pos + hash_len])
            pos += hash_len

        parsed.insert_chunk(
            Chunk(chunk_type=add_sub, number=chunk_num, hashes=hashes))

    return parsed
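This variant reads its header through a get_header() helper that the snippet does not define. Presumably it slices off and decodes the header bytes so the str operations above work under Python 3; a plausible sketch, offered as an assumption rather than the project's actual code:

def get_header(blob, eol):
    # Assumed behavior: take the header bytes up to the newline and decode
    # them so .count(':') and .split(':') operate on str under Python 3
    return blob[:eol].decode()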
Example #10
    def test_load(self):
        f = FileSource("file://" + self.source.name, 1)
        f.load()
        self.assertEqual(
            f.chunks, ChunkList(add_chunks=simple_adds,
                                sub_chunks=simple_subs))