def validate_feature_vectors(db, msg, java, extractor):
    """
    Return True if the features in msg can be validated using only the
    Mongo-stored artefact (ie. not kafka) and re-calculating; False when
    re-analysis of the stored code fails.

    Slow and expensive, so only called when --defensive is specified.
    Extensive checking to validate that key metrics are exactly what they
    should be so that data quality across the pipeline is high.

    Raises ValueError when the script cannot be located in db.scripts and
    AssertionError when a recomputed vector disagrees with msg.
    """
    assert isinstance(msg, dict)
    assert 'url' in msg
    assert os.path.exists(java)
    assert os.path.exists(extractor)
    jsr = JavascriptArtefact(url=msg['url'], sha256=msg['sha256'],
                             md5=msg['md5'], size_bytes=msg['size_bytes'])
    ret = db.scripts.find_one({'_id': ObjectId(msg['js_id'])})
    if ret is None:
        raise ValueError("Could not locate script: {}".format(jsr.url))
    features, failed, stderr = analyse_script(ret.get('code'), jsr,
                                              java=java, feature_extractor=extractor)
    if failed:
        return False
    # BUGFIX: replaced leftover debug print + bare assert with an assertion
    # message so a mismatch reports both counts instead of printing then dying
    n_recomputed = len(features['literals_by_count'])
    n_expected = len(msg['literals_by_count'])
    assert n_expected == n_recomputed, \
        "literal count mismatch for {}: expected {} got {}".format(jsr.url, n_expected, n_recomputed)
    assert features['statements_by_count'] == msg['statements_by_count']
    assert features['calls_by_count'] == msg['calls_by_count']
    return True
def iterate(consumer, max, cache, verbose=False):
    """
    Yield JavascriptArtefact records consumed from kafka, de-duplicated via cache.

    cache maps sha256 -> (md5, size_bytes); an artefact is only skipped when all
    three values match (guards against sha256 collisions). Consumer offsets are
    committed once per batch. NOTE: parameter name 'max' shadows the builtin but
    is kept for caller (keyword-argument) compatibility.
    """
    assert consumer is not None
    assert cache is not None
    assert max > 0
    # NB: sort each batch in order to maximise performance of fv_cache (sha256 should be sufficient)
    batch_size = 2000
    for batch_of_messages in batch(next_artefact(consumer, max, javascript_only(), verbose=verbose), n=batch_size):
        if len(batch_of_messages) < batch_size:
            print("WARNING: expected {} messages, got {}".format(batch_size, len(batch_of_messages)))
        n_cached = 0
        for r in sorted(batch_of_messages, key=lambda v: v['sha256']):
            jsr = JavascriptArtefact(**r)
            if verbose:
                print(jsr)
            if jsr.sha256 not in cache:  # idiomatic membership test (was: not jsr.sha256 in cache)
                cache[jsr.sha256] = (jsr.md5, jsr.size_bytes)
                yield jsr
            elif cache[jsr.sha256][0] == jsr.md5 and cache[jsr.sha256][1] == jsr.size_bytes:
                n_cached += 1
            else:
                # sha256 collison where md5 and/or size_bytes was not what was expected
                yield jsr
        print("Processed message batch: n={} cached={}".format(len(batch_of_messages), n_cached))
        if not isinstance(consumer, list):  # HACK: list is used for testing iterate() and it has no commit...
            consumer.commit()  # blocking call for now to ensure we only update the consumer offset at the end of a batch (this will cause dupes, but thats better than loss)
def test_analyse_script_utf8_handling(pytestconfig, analyse_utf8_expected_results):
    """UTF-8 content must round-trip through feature extraction with zero literal distance."""
    testjs = "{}/src/test-javascript/ca.js".format(pytestconfig.rootdir)
    with open(testjs, 'rb') as fp:
        jsr = JavascriptArtefact(url="file:{}".format(testjs), origin=None,
                                 sha256="XXX", md5="XXX")
        byte_content, failed, stderr = analyse_script(
            fp.read(), jsr,
            feature_extractor="{}/src/extract-features.jar".format(pytestconfig.rootdir))
    assert not failed
    js = json.loads(byte_content.decode())
    # must match results computed by java on the CLI...
    v1 = truncate_literals(js['literals_by_count'])
    v2 = truncate_literals(analyse_utf8_expected_results['literals_by_count'])
    assert len(v1) == len(v2)
    assert v1 == v2
    # and that calculate_literal_distance() is also zero (ie. unicode handling is robust)
    dist, n_not_in_origin, n_not_in_control, diff_lits = calculate_literal_distance(
        v1, v2, fail_if_difference=True)
    # BUGFIX: 'assert pytest.approx(dist, 0.0)' asserted a (truthy) approx object
    # and so could never fail; compare explicitly instead
    assert dist == pytest.approx(0.0)
    assert n_not_in_origin == 0
    assert n_not_in_control == 0
    assert diff_lits == []
def test_report_failure():
    """report_failure() must emit exactly one 'send' call to the failure topic carrying all artefact fields plus the reason."""
    fake_producer = mock.Mock()
    artefact = JavascriptArtefact(url='XXX', content_type='text/javascript',
                                  size_bytes=33, sha256='affb', md5='affc')
    report_failure(fake_producer, artefact, 'reason=hello world')

    recorded = fake_producer.method_calls
    assert len(recorded) == 1
    call_name, call_args, call_kwargs = recorded[0]
    assert call_name == 'send'
    assert len(call_args) == 2
    assert call_args[0] == 'feature-extraction-failures'

    payload = call_args[1].copy()
    assert 'when' in payload
    # FIXME... stronger test... ?
    assert isinstance(payload['when'], str) and len(payload['when']) > 10
    payload.pop('when', None)
    assert payload == {
        'url': 'XXX',
        'sha256': 'affb',
        'md5': 'affc',
        'inline': False,
        'content_type': 'text/javascript',
        'size_bytes': 33,
        'origin': None,
        'reason': 'reason=hello world',
        'js_id': ''
    }
def test_analyse_script_2(pytestconfig):
    """analyse_script() over a known fixture must produce exactly the expected AST, call and literal vectors."""
    expected_statements = {
        "FunctionNode": 2, "StringLiteral": 13, "VariableInitializer": 3,
        "Scope": 1, "KeywordLiteral": 3, "AstRoot": 1, "Assignment": 2,
        "IfStatement": 1, "Block": 2, "InfixExpression": 10,
        "ExpressionStatement": 4, "PropertyGet": 14, "ReturnStatement": 2,
        "UnaryExpression": 1, "Name": 37, "NumberLiteral": 2,
        "ArrayLiteral": 1, "VariableDeclaration": 3, "FunctionCall": 9,
        "ElementGet": 2, "ParenthesizedExpression": 3
    }
    expected_calls = {
        "val": 1, "F$": 3, "addMethod": 1, "get": 1, "attr": 1,
        "split": 1, "add": 1
    }
    expected_literals = {
        " ": 1, "0": 2, "#IsAfterGoLive": 1, "INPUT": 1,
        "requiredwhennotaftergolivevalidation": 1, "True": 1, "class": 1,
        "testrequiredwhennotaftergolivevalidation": 3, "SELECT": 1
    }
    fixture = "{}/src/test-javascript/fieldRequiredWhenNotAfterGoLiveValidation.js".format(
        pytestconfig.rootdir)
    jsr = JavascriptArtefact(url="file:{}".format(fixture), origin=None,
                             sha256="XXX", md5="XXX")
    with open(fixture, "rb") as fp:
        byte_content, failed, stderr = analyse_script(
            fp.read(), jsr,
            feature_extractor="{}/src/extract-features.jar".format(pytestconfig.rootdir))
    assert not failed
    features = json.loads(byte_content.decode())
    assert features['statements_by_count'] == expected_statements
    assert features['calls_by_count'] == expected_calls
    assert features['literals_by_count'] == expected_literals
def calc_vector(filename, extractor=None):
    """
    Analyse the javascript file named by filename and return the decoded
    feature-vector JSON as a dict.

    extractor: path to the feature-extractor jar; defaults to the global
    args.extractor for backward compatibility with existing callers.

    Raises Exception (with extractor stderr) when analysis fails.
    """
    if extractor is None:
        extractor = args.extractor  # preserve original reliance on global CLI args
    with open(filename, 'rb') as fp:
        jsr = JavascriptArtefact(url="file:{}".format(filename), sha256="XXX", md5="XXX")
        ret, failed, stderr = analyse_script(fp.read(), jsr, feature_extractor=extractor)
    if failed:
        raise Exception(stderr)
    return json.loads(ret.decode('utf-8'))
def find_or_update_analysis_content(db, m, fail_iff_not_found=False, defensive=False, java=None, extractor=None, force=False):
    """
    Return the decoded analysis (feature vector) JSON for the artefact described by m,
    computing and persisting it via analyse_script() when missing or when force=True.

    m must carry 'js_id', 'url', 'sha256', 'md5' and 'size_bytes'.
    fail_iff_not_found=True raises instead of recomputing (used to stop the
    self-recursion below from looping forever).
    defensive=True re-hashes the stored code bytes against m['sha256'].

    Raises ValueError when content cannot be found/analysed.
    """
    assert isinstance(m, dict)
    assert all(['js_id' in m, 'url' in m, 'sha256' in m, 'md5' in m, 'size_bytes' in m])
    js_id = m.get('js_id')
    assert len(js_id) > 0
    # NB: due to an error in processing, I had to throw away the db.analysis_content collection, so records may be missing. Sigh 8th June 2020
    if not force:
        byte_content_doc = db.analysis_content.find_one({'js_id': js_id})
        if fail_iff_not_found and byte_content_doc is None:  # prevent infinite recursion
            raise ValueError("No data for {}".format(js_id))
        elif byte_content_doc is None:
            print("WARNING: unable to locate analysis content for {}".format(js_id))
    else:
        byte_content_doc = None  # force=True: skip the lookup and always recompute below
    if byte_content_doc is None:
        # recompute path: fetch code from Mongo, analyse, persist, then re-enter
        code_bytes, js_id = get_script(db, js_id)
        assert code_bytes is not None
        jsr = JavascriptArtefact(url=m.get('url'), sha256=m.get('sha256'),
                                 md5=m.get('md5'), size_bytes=m.get('size_bytes'),
                                 js_id=js_id)
        vector_as_bytes, failed, stderr = analyse_script(code_bytes, jsr,
                                                         java=java, feature_extractor=extractor)
        if failed:
            raise ValueError("Could not analyse artefact: js_id={}\n{}".format(js_id, stderr))
        save_analysis_content(db, jsr, vector_as_bytes)
        if defensive:
            # check that artefact hashes match the actual content
            assert hashlib.sha256(code_bytes).hexdigest() == m.get('sha256')
        return find_or_update_analysis_content(db, m, fail_iff_not_found=True)  # this time it should be found!
    assert 'analysis_bytes' in byte_content_doc
    byte_content = byte_content_doc.get('analysis_bytes')
    assert isinstance(byte_content, bytes)
    return json.loads(byte_content.decode())
def test_javascript_artefact():
    """Exercise JavascriptArtefact construction defaults, 'checksum' compatibility and md5-based ordering."""
    # usual constructor parameters for most apps
    artefact = JavascriptArtefact(sha256="XYZ", md5="XXX", size_bytes=279,
                                  url="http://foo/bar/baz")
    assert artefact.sha256 == "XYZ"
    assert artefact.md5 == "XXX"
    assert artefact.size_bytes == 279
    assert artefact.url == "http://foo/bar/baz"
    # if origin is not specified it must be None
    assert artefact.origin is None
    assert artefact.inline == False
    assert artefact.content_type == "text/javascript"
    assert isinstance(artefact.when, str) and len(artefact.when) > 10
    # but etl_upload requires compatibility with 'checksum' instead of md5
    legacy = JavascriptArtefact(sha256="JJJ", checksum="XXX", size_bytes=122999,
                                url=None, origin="http://bar/baz")
    assert legacy.md5 == "XXX"
    assert legacy.size_bytes == 122999
    assert legacy.origin == "http://bar/baz"
    # and that sorting is correctly done via the md5 field
    middle = JavascriptArtefact(md5='333', url=None, sha256='')
    highest = JavascriptArtefact(md5='999', url=None, sha256='')
    lowest = JavascriptArtefact(md5='111', url=None, sha256='')
    assert sorted([middle, highest, lowest]) == [lowest, middle, highest]
def test_analyse_script_failure(pytestconfig):
    """
    mozilla rhino cant handle all JS... so check that the failure path is as expected.
    """
    testjs = "{}/src/test-javascript/google-analytics.js".format(pytestconfig.rootdir)
    with open(testjs, "rb") as fp:
        jsr = JavascriptArtefact(url="file:{}".format(testjs), origin=None,
                                 sha256="XXX", md5="XXX")
        # BUGFIX: result was previously bound to a local named 'json',
        # shadowing the json module for the rest of the function
        byte_content, failed, stderr = analyse_script(
            fp.read(), jsr,
            feature_extractor="{}/src/extract-features.jar".format(pytestconfig.rootdir))
    assert failed
    assert "missing ; after for-loop initializer" in stderr
def test_analyse_script(pytestconfig, analyse_script_expected_results):
    """analyse_script() over banners.js must reproduce the expected feature JSON (ignoring obsolete 'id')."""
    fixture = "{}/src/test-javascript/banners.js".format(pytestconfig.rootdir)
    extractor = "{}/src/extract-features.jar".format(pytestconfig.rootdir)
    with open(fixture, "rb") as fp:
        source_bytes = fp.read()
    jsr = JavascriptArtefact(url="file:{}".format(fixture), origin=None,
                             sha256='XXX', md5='XXX')
    byte_content, failed, stderr = analyse_script(source_bytes, jsr,
                                                  feature_extractor=extractor)
    assert not failed
    features = json.loads(byte_content.decode())
    features.pop('id', None)  # not interested in this field anymore -- obsolete
    assert features == analyse_script_expected_results
def resolve_feature_vector(db, message):
    """
    Resolve the statements-by-count feature vector for the URL named by
    message['id'], preferring the precomputed Mongo copy and recomputing
    with analyse_script() only as a fallback.

    Returns a dict with 'first_url', script hashes/size and the AST features.
    Raises Exception when the URL, script_url or script cannot be found, or
    when fallback analysis fails.
    """
    url = message.get('id')
    d = {'first_url': url}
    # 1. find url _id
    ret = db.urls.find_one(
        {'url': url}
    )  # TODO FIXME: do we care which timestamp of the url we get? nah... not for now
    if not ret:
        raise Exception("Could not find URL in MongoDB: {}".format(url))
    url_id = ret.get('_id')  # BUGFIX: renamed from 'id' which shadowed the builtin
    # 2. lookup url_id in script_urls collection
    ret = db.script_url.find_one({'url_id': url_id})
    if not ret:
        raise Exception("Could not find script_url in MongoDB: {} {}".format(url_id, url))
    script_id = ret.get('script')
    # 3. lookup script_id in scripts collection
    ret = db.scripts.find_one({'_id': script_id})
    if ret:
        d.update({
            'sha256': ret.get('sha256'),
            'md5': ret.get('md5'),
            'size_bytes': ret.get('size_bytes')
        })
    else:
        raise Exception("Could not find script {} {}".format(script_id, url))
    code = ret.get('code')
    # 4. finally we want to avoid re-computing the feature vector (slow) so we look it up in Mongo
    ret = db.statements_by_count.find_one({'url': url})
    if ret:
        d.update(**ret)
        d.pop('_id', None)
        d.pop('url', None)
    else:
        sha256 = message.get('sha256')
        md5 = message.get('md5')
        jsr = JavascriptArtefact(url=url, sha256=sha256, md5=md5)
        ret, failed, stderr = analyse_script(code, jsr)
        if not failed:
            # NOTE(review): elsewhere in this codebase analyse_script returns JSON
            # *bytes* requiring json.loads(); here the result is indexed directly --
            # confirm which return form applies on this path
            d.update(**ret['statements_by_count'])
        else:
            raise Exception(stderr)
    return d
def strategy_1_pyrequests(db, producer, artefact_url, cited_on, **kwargs):
    """
    Download artefact_url with the python requests library, persist the artefact
    via save_artefact() and announce it on the kafka topic named by kwargs['to'].

    Recognised kwargs: 'ua' (User-Agent header), 'referrer'/'referer'
    (Referer header), 'to' (destination topic).

    Returns the JavascriptArtefact on HTTP 200; None on any other status or on
    any exception (best-effort: errors are printed and swallowed).
    """
    assert db is not None
    assert producer is not None
    assert len(artefact_url) > 0
    assert len(cited_on) > 0
    ua = kwargs.get('ua', None)
    referrer = kwargs.get('referrer', kwargs.get('referer', None))  # one r or two.. your choice ;-)
    headers = {}
    if ua is not None:
        assert len(ua) > 0
        headers.update({'User-Agent': ua})
    if referrer is not None:
        assert len(referrer) > 0
        headers.update({'Referer': referrer})
    try:
        # SECURITY NOTE(review): verify=False disables TLS certificate checking --
        # presumably deliberate for crawling hostile/misconfigured sites, but
        # confirm before reusing this helper elsewhere
        resp = requests.get(artefact_url, headers=headers, verify=False)
        if resp.status_code == 200:
            content = resp.content
            sha256 = hashlib.sha256(content).hexdigest()
            artefact = JavascriptArtefact(url=artefact_url,
                                          sha256=sha256,
                                          md5=hashlib.md5(content).hexdigest(),
                                          size_bytes=len(content),
                                          origin=cited_on)
            # NB: ignore return result from save_artefact()
            ret, was_cached = save_artefact(
                db, artefact, None,
                content=content)  # will save to Mongo AND Kafka visited topic
            producer.send(kwargs.get('to'), ret, key=sha256.encode('utf-8'))
            return artefact
        else:
            return None
    except Exception as e:
        # best-effort: any failure (network, kafka, mongo) is reported and swallowed
        print(
            "Failed to download using python requests: {} (exception follows)".
            format(artefact_url))
        print(str(e))
        return None
def test_find_best_control(pytestconfig):
    """find_best_control() must match json2 against the single mocked control with (near) zero AST/function distance."""
    # def find_best_control(input_features, controls_to_search, max_distance=100.0, db=None, debug=False)
    js_file = "{}/src/test-javascript/json2_4.9.2.min.js".format(pytestconfig.rootdir)
    with open(js_file, 'rb') as fp:
        content = fp.read()
    jsr = JavascriptArtefact(url=js_file,
                             sha256=hashlib.sha256(content).hexdigest(),
                             md5=hashlib.md5(content).hexdigest(),
                             size_bytes=len(content))
    input_features, failed, stderr = analyse_script(
        content, jsr,
        feature_extractor="{}/src/extract-features.jar".format(pytestconfig.rootdir))
    assert not failed
    db = mock.Mock()
    # BUGFIX: the literal previously repeated 'blah' twice; a dict keeps only one
    db.javascript_controls.find_one.return_value = {'literals_by_count': {'blah': 0}}
    d = json.loads(input_features.decode())
    d['js_id'] = 'XXXXXXXXXXXXXXXXXXXXXXXX'
    d['sha256'] = 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
    best_control, next_best_control = find_best_control(db, d, debug=True)
    # EXPECTED RESULTS:
    # best_control = BestControl(control_url='https://cdn.jsdelivr.net/gh/WordPress/[email protected]//wp-includes/js/json2.min.js', origin_url='/home/acas/src/pi-cluster-ansible-cfg-mgmt/src/test-javascript/json2_4.9.2.min.js', sha256_matched=False, ast_dist=0.0, function_dist=0.0, diff_functions='', cited_on=None)
    # next_best_control has a larger ast_dist due to a different version, but it has the same function calls so the dist is zero
    print(best_control)
    print(next_best_control)
    # NB: with only 1 control in the test, next_best must have infinite distances
    assert next_best_control.ast_dist == pytest.approx(7.54983)
    assert next_best_control.function_dist == pytest.approx(0.0)
    assert best_control.ast_dist <= 0.0000001
    assert best_control.function_dist <= 0.000001
    # BUGFIX: 'assert pytest.approx(x, -1.0)' asserted a (truthy) approx object and
    # never tested anything; the apparent intent was literal_dist == -1.0 (sentinel)
    assert best_control.literal_dist == pytest.approx(-1.0)
    assert next_best_control.literal_dist == pytest.approx(-1.0)
def test_analyse_script_unpacker(pytestconfig):
    """All packed fixtures are expected to fail analysis; any that succeed must yield JSON output."""
    expected_results = {
        'packed1.js': True,
        'packed2.js': True,
        'packed3.js': True
    }
    # iterate the expectation map directly instead of a duplicate literal list
    for testjs, expected_failed in expected_results.items():
        filename = "{}/src/test-javascript/{}".format(pytestconfig.rootdir, testjs)
        jsr = JavascriptArtefact(url="file:{}".format(filename), origin=None,
                                 sha256='XXX', md5='XXX')
        # NOTE(review): the filename (not the file content) is passed to
        # analyse_script here, matching report_vectors() -- confirm intended
        byte_content, failed, stderr = analyse_script(
            filename, jsr,
            feature_extractor="{}/src/extract-features.jar".format(pytestconfig.rootdir))
        assert failed == expected_failed
        if not failed:
            assert byte_content[0] == ord('{')
# Validate the next kafka record and persist its AST/call vectors against the
# matching script document. Runs inside the enclosing consumer loop.
rec = message.value
if args.v:
    print(rec)
try:
    # all four fields are mandatory for the save_*_vector calls below
    assert 'calls_by_count' in rec
    assert 'statements_by_count' in rec
    assert 'url' in rec and rec['url'] is not None
    assert 'origin' in rec and rec['origin'] is not None
except AssertionError:
    # malformed record: dump it, count it and move on to the next message
    print(rec)
    n_bad += 1
    continue
jsr = JavascriptArtefact(url=rec['url'],
                         origin=rec['origin'],
                         sha256=rec.get('sha256', ''),
                         md5=rec.get('md5', ''),
                         inline=rec.get('inline'))
# want_code=False: only the script _id is needed here, not the (large) code bytes
script, url_entry = find_script(db, jsr.url, want_code=False)
if script is None or not '_id' in script:
    # script lookup failed -- report both lookup results for debugging
    print(script)
    print(url_entry)
    n_bad += 1
    continue
js_id = script.get('_id')
save_ast_vector(db, jsr, rec['statements_by_count'], js_id=js_id)
save_call_vector(db, jsr, rec['calls_by_count'], js_id=js_id)
n += 1
# progress report every 1000 records
if n % 1000 == 0:
    print("Processed {} records.".format(n))
print("Bad data - skipping... {}".format(hit)) continue assert 'origin_js_id' in hit js_id = hit.get('origin_js_id') if js_id is None: print("Bad data - no origin_js_id... skipping".format(hit)) continue ret = db.scripts.find_one({ '_id': ObjectId(js_id) }) if ret is None: # should not happen... but if it does... print("Unable to locate {} is db.scripts... skipping".format(js_id)) continue content = ret.get('code') jsr = JavascriptArtefact(url=hit.get('origin_url'), sha256=hashlib.sha256(content).hexdigest(), md5=hashlib.md5(content).hexdigest(), inline=False) m, failed, stderr = analyse_script(content, jsr, java=args.java, feature_extractor=args.extractor) if failed: n_failed += 1 continue m.update({ 'origin': hit.get('cited_on'), 'js_id': js_id }) assert 'js_id' in m and len(m['js_id']) > 0 # PRE-CONDITION: ensure hits have origin_js_id field set best_control, next_best_control = find_best_control(m, all_controls, db=db) d = asdict(best_control) # NB: all fields of the model are sent to output kafka topic and Mongo # 2a. also send results to MongoDB for batch-oriented applications and for long-term storage # POST-CONDITIONS which MUST be maintained are checked before pushing to topic assert 'cited_on' in d and len(d['cited_on']) > 0 assert 'origin_url' in d and len(d['origin_url']) > 0 assert isinstance(d['origin_js_id'], str) or d['origin_js_id'] is None
mongo.close() except NameError: pass # NameError occurs when using --file as consumer has not been setup since it is not required rm_pidfile('pid.eval.controls', root='.') sys.exit(0) if args.v: print(db.javascript_controls.distinct('family')) if args.file: with open(args.file, 'rb') as fp: content = fp.read() jsr = JavascriptArtefact(url=args.file, sha256=hashlib.sha256(content).hexdigest(), js_id='0' * 24, md5=hashlib.md5(content).hexdigest(), size_bytes=len(content)) byte_content, failed, stderr = analyse_script( content, jsr, feature_extractor=args.extractor) if failed: raise ValueError("Failed to analyse script: {}\n{}".format( jsr, stderr)) m = json.loads(byte_content.decode()) m.update(asdict(jsr)) print(m) best_control, next_best_control = find_best_control( db, m, max_distance=args.max_distance, debug=True) update_literal_distance(db, best_control, m['literals_by_count'],
def save_control(db, url, family, variant, version, force=False,
                 refuse_hashes=None, provider='', java='/usr/bin/java',
                 feature_extractor=None, content=None):
    """
    Update all control related data: analyse the control artefact and upsert its
    vectors, code and summary into MongoDB.

    refuse_hashes: set of sha256 hashes to skip (unless force=True); may now be
    omitted (defaults to the empty set) -- previously callers HAD to supply it
    or the membership test below raised TypeError.
    content: raw bytes of the control; fetched from url when None.

    Returns the JavascriptArtefact representing the control which has had its
    state updated into MongoDB. Raises ValueError on fetch or analysis failure.
    """
    assert url is not None
    assert family is not None
    assert version is not None
    if refuse_hashes is None:
        # ROBUSTNESS: tolerate the documented-but-unchecked missing argument
        refuse_hashes = set()
    if content is None:
        resp = requests.get(url)
        if resp.status_code != 200:
            raise ValueError("Failed to fetch [{}] {}".format(resp.status_code, url))
        content = resp.content
    sha256 = hashlib.sha256(content).hexdigest()
    md5 = hashlib.md5(content).hexdigest()
    jsr = JavascriptArtefact(when=str(datetime.utcnow()), sha256=sha256, md5=md5,
                             url=url, inline=False, content_type='text/javascript',
                             size_bytes=len(content))
    if jsr.size_bytes < 1000:
        print("Refusing artefact as too small to enable meaningful vector comparison: {}".format(jsr))
        return jsr
    if not force and jsr.sha256 in refuse_hashes:
        print("Refusing to update existing control as dupe: {}".format(jsr))
        return jsr
    bytes_content, failed, stderr = analyse_script(content, jsr, java=java,
                                                   feature_extractor=feature_extractor)
    if failed:
        raise ValueError('Could not analyse script {} - {}'.format(jsr.url, stderr))
    ret = json.loads(bytes_content.decode())
    cntrl_url, subfamily = identify_control_subfamily(jsr.url)
    ret.update({
        'family': family,
        'release': version,
        'variant': variant,
        'origin': url,
        'sha256': sha256,
        'md5': md5,
        'size_bytes': len(content),
        'do_not_load': False,  # all controls loaded by default except alpha/beta/release candidate
        'provider': provider,
        'subfamily': subfamily
    })
    #print(ret)
    assert 'sha256' in ret
    assert 'md5' in ret
    assert 'size_bytes' in ret
    # NB: only one control per url/family pair (although in theory each CDN url is enough on its own)
    # (return value was previously bound to an unused local; dropped)
    db.javascript_controls.find_one_and_update(
        {'origin': url, 'family': family}, {"$set": ret}, upsert=True)
    db.javascript_control_code.find_one_and_update(
        {'origin': url},
        {"$set": {
            'origin': url,
            'code': Binary(content),
            'analysis_bytes': bytes_content,
            "last_updated": jsr.when
        }},
        upsert=True)
    update_control_summary(db, url, ret['statements_by_count'],
                           ret['calls_by_count'], ret['literals_by_count'])
    return jsr
def report_vectors(db, artefact_fname, control_url: str, artefact_url: str):
    """
    Print a human-readable comparison of the AST, function-call and literal
    vectors between a stored control (looked up by control_url) and a local
    artefact file which is analysed on the fly.

    Raises ValueError when the artefact cannot be analysed; asserts when the
    control or its vectors are missing from MongoDB.
    """
    assert len(control_url) > 0 and len(artefact_url) > 0
    assert os.path.exists(artefact_fname)
    cntrl = db.javascript_controls.find_one({'origin': control_url})
    assert cntrl is not None
    assert 'literals_by_count' in cntrl
    assert 'statements_by_count' in cntrl
    assert 'calls_by_count' in cntrl
    # we must analyse the artefact to get the vectors for the artefact (since its too expensive to search kafka for it)
    jsr = JavascriptArtefact(url=artefact_url, sha256='XXX', md5='XXX', inline=False)
    byte_content, failed, stderr = analyse_script(artefact_fname, jsr)
    if failed:
        raise ValueError("Unable to analyse script: {}\n{}".format(artefact_url, stderr))
    ret = json.loads(byte_content.decode())
    assert 'literals_by_count' in ret
    assert 'statements_by_count' in ret
    assert 'calls_by_count' in ret
    # ok, now we have the vectors, lets report the comparison between control and artefact...
    v1, ast1_sum = calculate_ast_vector(cntrl['statements_by_count'])
    v2, ast2_sum = calculate_ast_vector(ret['statements_by_count'])
    print("Control url is: {}".format(control_url))
    print("Artefact url is: {}".format(artefact_url))
    print("AST vector magnitudes: control={} artefact={}".format(ast1_sum, ast2_sum))
    print(v1)
    print(v2)
    dist = compute_distance(v1, v2)
    print("AST distance: {:.2f}".format(dist))
    # report which AST features differ, smallest absolute difference first
    diff_features = []
    for feature_idx, feature in enumerate(ast_feature_list):
        if v1[feature_idx] != v2[feature_idx]:
            diff_features.append((feature, abs(v1[feature_idx] - v2[feature_idx])))
    items = [
        '{} ({})'.format(t[0], t[1])
        for t in sorted(diff_features, key=lambda t: t[1])
    ]
    print("AST features which are different: ", ','.join(items))
    # compare function-call counts over the union of all called names
    diffs = []
    all_calls = set(cntrl['calls_by_count'].keys()).union(ret['calls_by_count'].keys())
    for fn in all_calls:
        cntl_cnt = cntrl['calls_by_count'].get(fn, 0)
        artefact_cnt = ret['calls_by_count'].get(fn, 0)
        if cntl_cnt != artefact_cnt:
            diffs.append(fn)
    v1, fn1_sum = calculate_vector(cntrl['calls_by_count'], feature_names=all_calls)
    v2, fn2_sum = calculate_vector(ret['calls_by_count'], feature_names=all_calls)
    print("Function call magnitudes: control={} artefact={}".format(fn1_sum, fn2_sum))
    print(v1)
    print(v2)
    dist = compute_distance(v1, v2)
    print("Function call distance: {:.2f}".format(dist))
    if len(diffs) == 0:
        print("All functions called the expected number of times.")
    else:
        print("Functions not called the expected number of times: {}".format(' '.join(diffs)))
    # finally compare (truncated) literals between control and artefact
    t = calculate_literal_distance(
        truncate_literals(cntrl['literals_by_count']),
        truncate_literals(ret['literals_by_count']))
    literal_dist, n_not_in_origin, n_not_in_control, diff_literals = t
    print("Literal distance is: {}".format(literal_dist))
    print("Number of literals in control but not origin: {}".format(n_not_in_origin))
    print("Number of literals in origin but not control: {}".format(n_not_in_control))
    print("Diff literals: {}".format(diff_literals))
if __name__ == "__main__": args = a.parse_args() mongo = pymongo.MongoClient(args.db, args.port, username=args.dbuser, password=str(args.dbpassword)) db = mongo[args.dbname] cursor = db.javascript_control_code.find({}, no_cursor_timeout=True) # long-running find so we try to avoid it being killed prematurely... with cursor: for rec in cursor: assert 'code' in rec control_url = rec.get('origin') assert control_url.startswith("http") if args.v: print(control_url) # recalculate vector? if args.recalc: jsr = JavascriptArtefact(url=control_url, sha256='XXX', md5='YYY', inline=False) # only url matters for analyse script vectors, failed, stderr = analyse_script(rec.get('code'), jsr, java=args.java, feature_extractor=args.extractor) assert not failed assert isinstance(vectors, bytes) print(jsr.url) required_hash = hashlib.sha256(vectors).hexdigest() db.javascript_control_code.update_one({ '_id': rec.get('_id') }, { "$set": { "analysis_bytes": Binary(vectors) } }) else: vectors = rec.get('analysis_bytes') assert vectors is not None assert isinstance(vectors, bytes) d = json.loads(vectors) update_control_summary(db, control_url, d['statements_by_count'], d['calls_by_count'], d['literals_by_count']) db.javascript_controls.update_one({ 'origin': control_url }, { '$set': {