def test_parse_file_source(self):
    d = b''.join([self.hm, self.hg])
    add = b"a:17:32:%d\n%s" % (len(d), d)
    sub = b"s:18:32:%d\n%s" % (len(d), d)
    adds = [Chunk(chunk_type='a', number=17,
                  hashes=set([self.hg, self.hm]), hash_size=32)]
    subs = [Chunk(chunk_type='s', number=18,
                  hashes=set([self.hg, self.hm]), hash_size=32)]

    self.assertEqual(parse_file_source(io.BytesIO(add)),
                     ChunkList(add_chunks=adds))
    self.assertEqual(parse_file_source(io.BytesIO(sub)),
                     ChunkList(sub_chunks=subs))
    # Both adds and subs with a spurious newline in between
    both = b"%s\n%s" % (add, sub)
    self.assertEqual(parse_file_source(io.BytesIO(both)),
                     ChunkList(add_chunks=adds, sub_chunks=subs))
class Source(object):
    """
    Base class for data sources
    """

    def __init__(self, source_url, refresh_interval):
        self.source_url = source_url
        self.url = urlparse(self.source_url)
        self.interval = int(refresh_interval)
        self.last_refresh = 0
        self.last_check = 0
        # Initialize with an empty data set so we can always continue to serve
        self.chunks = ChunkList()
        self.chunk_index = {'adds': set(()), 'subs': set(())}
        self.prefixes = None
        self.no_data = True

    def load(self):
        raise NotImplementedError

    def _populate_chunks(self, fp, parser_func, *args, **kwargs):
        try:
            self.chunks = parser_func(fp, *args, **kwargs)
            self.last_check = int(time.time())
            self.last_refresh = int(time.time())
            self.chunk_index = {'adds': set(self.chunks.adds.keys()),
                                'subs': set(self.chunks.subs.keys())}
        except ParseError as e:
            raise ParseError('Error parsing "%s": %s' % (self.url.path, e))

    def refresh(self):
        # Prevent constant refresh checks
        now = int(time.time())
        if now - self.interval >= self.last_check:
            self.last_check = now
            if self.needs_refresh():
                self.load()
        return False

    def needs_refresh(self):
        return False

    def fetch(self, adds, subs):
        self.refresh()
        chunks = {'adds': [], 'subs': []}
        for chunk_num in adds:
            chunks['adds'].append(self.chunks.adds[chunk_num])
        for chunk_num in subs:
            chunks['subs'].append(self.chunks.subs[chunk_num])
        return chunks

    def list_chunks(self):
        self.refresh()
        return (self.chunk_index['adds'], self.chunk_index['subs'])

    def find_prefix(self, prefix):
        return self.chunks.find_prefix(prefix)
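# --- Hypothetical sketch, not part of the original source ----------------
# A minimal illustration of how a concrete data source might fill in the
# load()/needs_refresh() hooks of Source above.  The class name
# LocalFileSource and the mtime-based staleness check are assumptions made
# for this example; the project's real FileSource/S3FileSource classes may
# differ.
import os


class LocalFileSource(Source):

    def load(self):
        # Parse the chunk file named in the source URL and index its chunks
        with open(self.url.path, 'rb') as fp:
            self._populate_chunks(fp, parse_file_source)
        self.no_data = False

    def needs_refresh(self):
        # Consider the source stale once the file changes on disk
        try:
            return os.path.getmtime(self.url.path) > self.last_refresh
        except OSError:
            return False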
def test_load(self):
    with mock_s3():
        conn = boto.connect_s3()
        b = conn.create_bucket(self.bucket_name)
        k = Key(b)
        k.name = self.key_name
        k.set_contents_from_string(self.add + b'\n' + self.sub)

        f = S3FileSource("s3+file://{0}/{1}".format(self.bucket_name,
                                                    self.key_name), 0.5)
        f.load()
        self.assertEqual(f.chunks, ChunkList(add_chunks=simple_adds,
                                             sub_chunks=simple_subs))
    _d = b''.join([hm, hg])
    add = b"a:17:32:%d\n%s" % (len(_d), _d)
    sub = b"s:18:32:%d\n%s" % (len(_d), _d)

    def setUp(self):
        self.maxDiff = None
        super(ShavarTestCase, self).setUp()

    def get_configurator(self):
        config = super(ShavarTestCase, self).get_configurator()
        config.include("shavar")
        return config


def chunkit(n, typ, *urls):
    return Chunk(number=n, chunk_type=typ,
                 hashes=[hashlib.sha256(u.encode()).digest() for u in urls])


DELTA_RESULT = ChunkList(
    add_chunks=[
        chunkit(1, 'a', 'https://www.mozilla.org/', 'https://www.google.com/'),
        chunkit(2, 'a', 'https://github.com/', 'http://www.python.org/'),
        chunkit(4, 'a', 'http://www.haskell.org/', 'https://www.mozilla.com/'),
        chunkit(5, 'a', 'http://www.erlang.org', 'http://golang.org/')],
    sub_chunks=[
        chunkit(3, 's', 'https://github.com/'),
        chunkit(6, 's', 'http://golang.org')])
def parse_file_source(handle):
    """
    Parses a chunk list formatted file
    """
    # We should almost certainly* find the first newline within the first
    # 32 bytes of the file.  The header that precedes it is a colon
    # delimited string with the following members:
    #
    #   - type of chunk: 'a' or 's' == 1 byte
    #   - chunk number: assuming len(str(2**32)) == max of 10 bytes
    #   - size of the hash prefix in bytes: 4 for shavar or 32 for
    #     digest256 == max of 2 bytes
    #   - length of the raw data that follows, in octets:
    #     len(str(2**32)) == max of 10 bytes
    #
    # Those fields total 23 bytes; add 3 bytes for the colons and one for
    # the newline and the likely maximum header length is 27 bytes, with a
    # minimum of 8 bytes ("1:1:4:1\n").
    #
    # So a 32 byte read should be more than sufficient.
    #
    # * If 64 bit ints get involved, there are other issues to address
    parsed = ChunkList()
    while True:
        blob = handle.read(32)
        # Consume any unnecessary newlines in front of chunks
        blob = blob.lstrip('\n')
        if not blob:
            break
        if len(blob) < 8:
            raise ParseError("Incomplete chunk file? Could only read %d "
                             "bytes of header." % len(blob))
        eol = blob.find('\n')
        if eol < 8:
            raise ParseError('Impossibly short chunk header: "%s"' % eol)
        header = blob[:eol]
        if header.count(':') != 3:
            raise ParseError('Incorrect number of fields in chunk header: '
                             '"%s"' % header)
        add_sub, chunk_num, hash_len, read_len = header.split(':', 3)
        if len(add_sub) != 1:
            raise ParseError('Chunk type is too long: "%s"' % header)
        if add_sub not in ('a', 's'):
            raise ParseError('Invalid chunk type: "%s"' % header)
        try:
            chunk_num = int(chunk_num)
            hash_len = int(hash_len)
            read_len = int(read_len)
        except ValueError:
            raise ParseError('Non-integer chunk values: "%s"' % header)
        if read_len % hash_len != 0:
            raise ParseError('Chunk data length not a multiple of prefix '
                             'size: "%s"' % header)
        blob = blob[eol + 1:]
        blob += handle.read(read_len - len(blob))
        if blob is None or len(blob) < read_len:
            raise ParseError('Chunk data truncated for chunk %d' % chunk_num)
        hashes = []
        pos = 0
        while pos < read_len:
            hashes.append(blob[pos:pos + hash_len])
            pos += hash_len
        parsed.insert_chunk(Chunk(chunk_type=add_sub, number=chunk_num,
                                  hashes=hashes))
    return parsed
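# --- Illustration only, not part of the original source ------------------
# A quick sanity check of the header-size arithmetic described in the
# comment above: the widest plausible header stays under the 32 bytes read,
# and the narrowest possible header is exactly 8 bytes.  The field values
# below are made up to hit the extremes.
widest = b"a:4294967295:32:4294967295\n"   # 23 bytes of fields + 3 colons + newline
narrowest = b"1:1:4:1\n"
assert len(widest) == 27
assert len(narrowest) == 8
assert len(widest) <= 32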
    except ValueError, e:
        raise ParseError("Could not parse index file: %s" % e)

    if 'name' not in index:
        raise ParseError("Incorrectly formatted index: missing list name")
    if 'chunks' not in index:
        raise ParseError("Incorrectly formatted index: missing chunks")

    if 'basedir' in index:
        basedir = posixpath.join(os.path.dirname(handle.name),
                                 index['basedir'])
    else:
        basedir = os.path.dirname(handle.name)

    parsed = ChunkList()
    for key in index['chunks'].iterkeys():
        # A little massaging to make the data structure a little cleaner
        try:
            index['chunks'][int(key)] = index['chunks'][key]
            del index['chunks'][key]
        except KeyError:
            raise ParseError("Some weird behaviour with the list of chunks "
                             "in \"%s\"" % handle.filename)
        chunk_file = posixpath.join(basedir, key)
        if not exists_cb(chunk_file):
            raise ParseError("Invalid chunk filename: \"%s\"" % chunk_file)
        with open_cb(chunk_file, 'rb') as f:
def parse_dir_source(handle, exists_cb=os.path.exists, open_cb=open):
    """
    Expects a file-like object with the contents of a JSON formatted index
    file that has the following structure:

    {
        "name": "mozpub-tracking-digest256",
        "basedir": "mozpub-tracking-digest256",
        "chunks": {
            "1": {
                "path": "mozpub-tracking-digest256/1",
                "hashes(optional)": ["", ""],
                "prefixes(optional)": ["", ""]
            },
            "2": {
                "path": "mozpub-tracking-digest256/2",
                "hashes": ["", ""],
                "prefixes": ["", ""]
            }
        }
    }

    The basedir, hashes, and prefixes entries are optional.  The chunks to
    be served will be parsed with parse_file_source().  If hashes and
    prefixes are provided, they will be verified against the data provided
    in the given chunk file.
    """
    try:
        index = json.load(handle)
    except ValueError as e:
        raise ParseError("Could not parse index file: %s" % e)

    if 'name' not in index:
        raise ParseError("Incorrectly formatted index: missing list name")
    if 'chunks' not in index:
        raise ParseError("Incorrectly formatted index: missing chunks")

    if 'basedir' in index:
        basedir = posixpath.join(os.path.dirname(handle.name),
                                 index['basedir'])
    else:
        handle_name = handle.name
        if isinstance(handle_name, int):
            handle_name = str(handle_name)
        basedir = os.path.dirname(handle_name)

    parsed = ChunkList()
    int_key_chunks = {}
    for key in index['chunks'].keys():
        # A little massaging to make the data structure a little cleaner
        try:
            int_key_chunks[int(key)] = index['chunks'][key]
        except KeyError:
            raise ParseError("Some weird behaviour with the list of chunks "
                             "in \"%s\"" % handle.filename)
        chunk_file = posixpath.join(basedir, str(key))
        if not exists_cb(chunk_file):
            raise ParseError("Invalid chunk filename: \"%s\"" % chunk_file)
        with open_cb(chunk_file, 'rb') as f:
            chunk_list = parse_file_source(f)
        # Only one chunk per file
        if len(chunk_list) > 1:
            raise ParseError("More than one chunk in chunk file \"%s\"" %
                             chunk_file)
        for chunk in itertools.chain(iter(chunk_list.adds.values()),
                                     iter(chunk_list.subs.values())):
            parsed.insert_chunk(chunk)
    index['chunks'] = int_key_chunks
    return parsed
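# --- Hypothetical usage sketch, not part of the original source ----------
# Builds a minimal on-disk layout matching the index format documented in
# parse_dir_source() and parses it.  The list name, chunk number, and URL
# are invented for this example.
import hashlib
import json
import os
import tempfile

workdir = tempfile.mkdtemp()
digest = hashlib.sha256(b'https://example.invalid/').digest()

# A single chunk file holding add chunk number 1 with one 32-byte hash
with open(os.path.join(workdir, '1'), 'wb') as chunk_fp:
    chunk_fp.write(b"a:1:32:%d\n" % len(digest) + digest)

# The index that points at it; "basedir" is omitted, so chunk paths are
# resolved relative to the directory containing the index file
with open(os.path.join(workdir, 'index.json'), 'w') as index_fp:
    json.dump({"name": "example-digest256",
               "chunks": {"1": {"path": "1"}}}, index_fp)

with open(os.path.join(workdir, 'index.json')) as fp:
    chunk_list = parse_dir_source(fp)
assert 1 in chunk_list.adds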
def parse_file_source(handle):
    """
    Parses a chunk list formatted file
    """
    # We should almost certainly* find the first newline within the first
    # 32 bytes of the file.  The header that precedes it is a colon
    # delimited string with the following members:
    #
    #   - type of chunk: 'a' or 's' == 1 byte
    #   - chunk number: assuming len(str(2**32)) == max of 10 bytes
    #   - size of the hash prefix in bytes: 4 for shavar or 32 for
    #     digest256 == max of 2 bytes
    #   - length of the raw data that follows, in octets:
    #     len(str(2**32)) == max of 10 bytes
    #
    # Those fields total 23 bytes; add 3 bytes for the colons and one for
    # the newline and the likely maximum header length is 27 bytes, with a
    # minimum of 8 bytes ("1:1:4:1\n").
    #
    # So a 32 byte read should be more than sufficient.
    #
    # * If 64 bit ints get involved, there are other issues to address
    parsed = ChunkList()
    while True:
        blob = handle.read(32)
        # Consume any unnecessary newlines in front of chunks
        blob = blob.lstrip(b'\n')
        if not blob:
            break
        if len(blob) < 8:
            raise ParseError("Incomplete chunk file? Could only read %d "
                             "bytes of header." % len(blob))
        eol = blob.find(b'\n')
        if eol < 8:
            raise ParseError('Impossibly short chunk header: "%s"' % eol)
        header = get_header(blob, eol)
        if header.count(':') != 3:
            raise ParseError('Incorrect number of fields in chunk header: '
                             '"%s"' % header)
        add_sub, chunk_num, hash_len, read_len = header.split(':', 3)
        if len(add_sub) != 1:
            raise ParseError('Chunk type is too long: "%s"' % header)
        if add_sub not in ('a', 's'):
            raise ParseError('Invalid chunk type: "%s"' % header)
        try:
            chunk_num = int(chunk_num)
            hash_len = int(hash_len)
            read_len = int(read_len)
        except ValueError:
            raise ParseError('Non-integer chunk values: "%s"' % header)
        if read_len % hash_len != 0:
            raise ParseError('Chunk data length not a multiple of prefix '
                             'size: "%s"' % header)
        blob = blob[eol + 1:]
        blob += handle.read(read_len - len(blob))
        if blob is None or len(blob) < read_len:
            raise ParseError('Chunk data truncated for chunk %d' % chunk_num)
        hashes = []
        pos = 0
        while pos < read_len:
            hashes.append(blob[pos:pos + hash_len])
            pos += hash_len
        parsed.insert_chunk(
            Chunk(chunk_type=add_sub, number=chunk_num, hashes=hashes))
    return parsed
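# --- Hypothetical usage sketch, not part of the original source ----------
# Serializes one add chunk in the wire format parsed above and reads it
# back.  The chunk number (17), hash size (32), and URL are illustrative.
import hashlib
import io

digest = hashlib.sha256(b'https://www.mozilla.org/').digest()
raw = b"a:17:32:%d\n%s" % (len(digest), digest)

chunk_list = parse_file_source(io.BytesIO(raw))
assert 17 in chunk_list.adds      # adds/subs are dicts keyed by chunk number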
def test_load(self):
    f = FileSource("file://" + self.source.name, 1)
    f.load()
    self.assertEqual(f.chunks,
                     ChunkList(add_chunks=simple_adds,
                               sub_chunks=simple_subs))