Example #1
    def test_parse_file_source(self):
        d = b''.join([self.hm, self.hg])
        add = b"a:17:32:%d\n%s" % (len(d), d)
        sub = b"s:18:32:%d\n%s" % (len(d), d)

        adds = [
            Chunk(chunk_type='a',
                  number=17,
                  hashes=set([self.hg, self.hm]),
                  hash_size=32)
        ]
        subs = [
            Chunk(chunk_type='s',
                  number=18,
                  hashes=set([self.hg, self.hm]),
                  hash_size=32)
        ]

        self.assertEqual(parse_file_source(io.BytesIO(add)),
                         ChunkList(add_chunks=adds))
        self.assertEqual(parse_file_source(io.BytesIO(sub)),
                         ChunkList(sub_chunks=subs))
        # Both adds and subs with a spurious newline in between
        both = b"%s\n%s" % (add, sub)
        self.assertEqual(parse_file_source(io.BytesIO(both)),
                         ChunkList(add_chunks=adds, sub_chunks=subs))
Example #2
    def __init__(self, source_url, refresh_interval):
        self.source_url = source_url
        self.url = urlparse(self.source_url)
        self.interval = int(refresh_interval)
        self.last_refresh = 0
        self.last_check = 0
        # Initialize with an empty data set so we can always continue to serve
        self.chunks = ChunkList()
        self.chunk_index = {'adds': set(), 'subs': set()}
        self.prefixes = None
        self.no_data = True
Example #3
    def test_load(self):
        with mock_s3():
            conn = boto.connect_s3()
            b = conn.create_bucket(self.bucket_name)
            k = Key(b)
            k.name = self.key_name
            k.set_contents_from_string(self.add + b'\n' + self.sub)

            f = S3FileSource("s3+file://{0}/{1}".format(self.bucket_name,
                                                        self.key_name),
                             0.5)
            f.load()
            self.assertEqual(f.chunks, ChunkList(add_chunks=simple_adds,
                                                 sub_chunks=simple_subs))
Example #4
    _d = b''.join([hm, hg])
    add = b"a:17:32:%d\n%s" % (len(_d), _d)
    sub = b"s:18:32:%d\n%s" % (len(_d), _d)

    def setUp(self):
        self.maxDiff = None
        super(ShavarTestCase, self).setUp()

    def get_configurator(self):
        config = super(ShavarTestCase, self).get_configurator()
        config.include("shavar")
        return config


def chunkit(n, typ, *urls):
    return Chunk(number=n,
                 chunk_type=typ,
                 hashes=[hashlib.sha256(u.encode()).digest() for u in urls])


DELTA_RESULT = ChunkList(
    add_chunks=[
        chunkit(1, 'a', 'https://www.mozilla.org/', 'https://www.google.com/'),
        chunkit(2, 'a', 'https://github.com/', 'http://www.python.org/'),
        chunkit(4, 'a', 'http://www.haskell.org/', 'https://www.mozilla.com/'),
        chunkit(5, 'a', 'http://www.erlang.org', 'http://golang.org/')
    ],
    sub_chunks=[
        chunkit(3, 's', 'https://github.com/'),
        chunkit(6, 's', 'http://golang.org')
    ])
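For reference, a quick check of what chunkit produces. This assumes Chunk exposes the chunk_type, number, and hashes values passed to its constructor, as the assertions in the earlier tests suggest:

c = chunkit(1, 'a', 'https://www.mozilla.org/')
assert c.number == 1 and c.chunk_type == 'a'
assert hashlib.sha256(b'https://www.mozilla.org/').digest() in c.hashes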
Example #5
def parse_dir_source(handle, exists_cb=os.path.exists, open_cb=open):
    """
    Expects a file-like object with the contents of a JSON-formatted index
    file that has the following structure:

    {
        "name": "mozpub-tracking-digest256",
        "basedir": "mozpub-tracking-digest256",
        "chunks": {
            "1": {
                "path": "mozpub-tracking-digest256/1",
                "hashes(optional)": [ "", "" ],
                "prefixes(optional)": [ "", "" ]
            },
            "2": {
                "path": "mozpub-tracking-digest256/2",
                "hashes": [ "", "" ],
                "prefixes": [ "", "" ]
            }
        }
    }

    The basedir, hashes, and prefixes entries are optional.  The chunks to be
    served will be parsed with parse_file_source().  If hashes and prefixes are
    provided, they will be verified against the data provided in the given
    chunk file.
    """
    try:
        index = json.load(handle)
    except ValueError as e:
        raise ParseError("Could not parse index file: %s" % e)

    if 'name' not in index:
        raise ParseError("Incorrectly formatted index: missing list name")

    if 'chunks' not in index:
        raise ParseError("Incorrectly formatted index: missing chunks")

    if 'basedir' in index:
        basedir = posixpath.join(os.path.dirname(handle.name),
                                 index['basedir'])
    else:
        # handle.name may be an int (a raw file descriptor) rather than a
        # path, so coerce it before computing the directory
        handle_name = handle.name
        if isinstance(handle_name, int):
            handle_name = str(handle_name)
        basedir = os.path.dirname(handle_name)

    parsed = ChunkList()
    int_key_chunks = {}
    for key in index['chunks'].keys():
        # A little massaging to make the data structure a little cleaner
        try:
            int_key_chunks[int(key)] = index['chunks'][key]
        except ValueError:
            raise ParseError("Non-integer chunk number \"%s\" in the index "
                             "file \"%s\"" % (key, handle.name))
        chunk_file = posixpath.join(basedir, str(key))

        if not exists_cb(chunk_file):
            raise ParseError("Invalid chunk filename: \"%s\"" % chunk_file)

        with open_cb(chunk_file, 'rb') as f:
            chunk_list = parse_file_source(f)

        # Only one chunk per file
        if len(chunk_list) > 1:
            raise ParseError("More than one chunk in chunk file \"%s\"" %
                             chunk_file)

        for chunk in itertools.chain(iter(chunk_list.adds.values()),
                                     iter(chunk_list.subs.values())):
            parsed.insert_chunk(chunk)
    index['chunks'] = int_key_chunks
    return parsed
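A sketch of exercising parse_dir_source without touching the filesystem, using its exists_cb and open_cb hooks; the index contents, NamedStringIO class, and chunk payload below are made up for illustration:

import io
import json

class NamedStringIO(io.StringIO):
    # parse_dir_source reads handle.name to compute basedir
    name = "index.json"

index = {"name": "test-digest256",
         "chunks": {"17": {"path": "test-digest256/17"}}}
payload = b'\x00' * 32 + b'\x01' * 32
chunk_blob = b"a:17:32:%d\n%s" % (len(payload), payload)

result = parse_dir_source(
    NamedStringIO(json.dumps(index)),
    exists_cb=lambda path: True,
    open_cb=lambda path, mode: io.BytesIO(chunk_blob))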
Example #6
def parse_file_source(handle):
    """
    Parses a chunk list formatted file
    """
    # We should almost certainly* find the first newline within the first
    # 32 bytes of the file.  The chunk header it terminates is a colon
    # delimited string with the following fields:
    #
    #  - type of chunk: 'a' or 's' == max of 1 byte
    #  - chunk number:  assuming a max of 2**32, len("4294967296") == max
    #                   of 10 bytes
    #  - hash prefix size in bytes: 4 for shavar or 32 for digest256 == max
    #                               of 2 bytes
    #  - length of the raw data that follows, in octets: again a max of 10
    #    bytes
    #
    #  These fields total 23 bytes; 3 colons and one newline bring the
    #  likely maximum header length to 27 bytes, with a minimum of 8
    #  ("1:1:4:1\n").
    #
    #  So a 32 byte read should be more than sufficient.
    #
    # * If 64 bit ints get involved, there are other issues to address

    parsed = ChunkList()
    while True:
        blob = handle.read(32)

        # Consume any unnecessary newlines in front of chunks
        blob = blob.lstrip(b'\n')

        if not blob:
            break

        if len(blob) < 8:
            raise ParseError("Incomplete chunk file? Could only read %d "
                             "bytes of header." % len(blob))

        eol = blob.find(b'\n')
        if eol < 8:
            raise ParseError('Impossibly short chunk header: "%s"' % blob)
        header = get_header(blob, eol)

        if header.count(':') != 3:
            raise ParseError('Incorrect number of fields in chunk header: '
                             '"%s"' % header)

        add_sub, chunk_num, hash_len, read_len = header.split(':', 3)

        if len(add_sub) != 1:
            raise ParseError('Chunk type is too long: "%s"' % header)
        if add_sub not in ('a', 's'):
            raise ParseError('Invalid chunk type: "%s"' % header)

        try:
            chunk_num = int(chunk_num)
            hash_len = int(hash_len)
            read_len = int(read_len)
        except ValueError:
            raise ParseError('Non-integer chunk values: "%s"' % header)

        if hash_len == 0 or read_len % hash_len != 0:
            raise ParseError('Chunk data length not a multiple of prefix '
                             'size: "%s"' % header)

        blob = blob[eol + 1:]
        blob += handle.read(read_len - len(blob))
        if len(blob) < read_len:
            raise ParseError('Chunk data truncated for chunk %d' % chunk_num)

        hashes = []
        pos = 0
        while pos < read_len:
            hashes.append(blob[pos:pos + hash_len])
            pos += hash_len

        parsed.insert_chunk(
            Chunk(chunk_type=add_sub, number=chunk_num, hashes=hashes))

    return parsed
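A short round trip through parse_file_source, using the same literal layout as the test in Example #1 (32-byte digest256 hashes):

import io

payload = b'\xaa' * 32 + b'\xbb' * 32
data = b"a:17:32:%d\n%s" % (len(payload), payload)
chunks = parse_file_source(io.BytesIO(data))
# chunks is a ChunkList containing a single add chunk numbered 17 whose
# hashes are the two 32-byte digests above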
Example #7
    def test_load(self):
        f = FileSource("file://" + self.source.name, 1)
        f.load()
        self.assertEqual(
            f.chunks, ChunkList(add_chunks=simple_adds,
                                sub_chunks=simple_subs))