def update_literal_distance(db, hit: BestControl, ovec, fail_if_difference=False):
    """
    Fill in the literal-comparison fields of ``hit`` in place.

    The literal vector of the control named by ``hit.control_url`` is looked up
    via ``lookup_control_literals`` and compared against ``ovec`` (the literal
    vector of the origin artefact) using ``calculate_literal_distance``.

    :param db: database handle, passed straight through to ``lookup_control_literals``
    :param hit: the hit to update; ``literal_dist``, ``literals_not_in_origin``,
                ``literals_not_in_control``, ``n_diff_literals`` and
                ``diff_literals`` are overwritten
    :param ovec: literal vector for the origin artefact (must not be None)
    :param fail_if_difference: forwarded to ``calculate_literal_distance`` and
                               used as the ``debug`` flag of the control lookup
    :return: None - ``hit`` is mutated
    :raises ValueError: whatever ``calculate_literal_distance`` raises propagates unchanged
    """
    assert hit is not None
    assert ovec is not None

    # An empty (or missing) control URL means there is nothing to look up.
    control_literal_vector = None
    if hit.control_url:
        control_literal_vector = lookup_control_literals(
            db, hit.control_url, debug=fail_if_difference)

    if control_literal_vector is None:
        # No control literals available: mark every literal field as "not computed".
        hit.literal_dist = -1.0
        hit.literals_not_in_origin = -1
        hit.literals_not_in_control = -1
        hit.n_diff_literals = -1
        hit.diff_literals = ''
        return

    v1 = truncate_literals(control_literal_vector)
    v2 = truncate_literals(ovec)
    (hit.literal_dist,
     hit.literals_not_in_origin,
     hit.literals_not_in_control,
     diff_literals) = calculate_literal_distance(
         v1, v2, fail_if_difference=fail_if_difference)
    hit.n_diff_literals = len(diff_literals)
    hit.diff_literals = fix_literals(diff_literals)
def test_best_control_infinity_handling():
    """A hit with all-infinite distances must lose to a zero-distance hit, in both directions."""
    infinity = float('Inf')
    worst = BestControl(control_url='',
                        origin_url='',
                        sha256_matched=False,
                        diff_functions='',
                        ast_dist=infinity,
                        function_dist=infinity,
                        literal_dist=infinity)
    perfect = BestControl(control_url='XXX',
                          origin_url='YYY',
                          sha256_matched=True,
                          diff_functions='',
                          ast_dist=0.0,
                          function_dist=0.0,
                          literal_dist=0.0)

    assert perfect.is_better(worst)
    assert not worst.is_better(perfect)
def test_best_control_max_distance():
    """Check which acceptance reason is reported as the distance threshold moves."""
    # original note on this construction: "need 12 to disable original criteria"
    candidate = BestControl(control_url='XXX',
                            origin_url='YYY',
                            sha256_matched=True,
                            diff_functions=' ',
                            ast_dist=12.0,
                            function_dist=3.0,
                            literal_dist=7.0)

    # distprod is (12+3) * (3+7) == 150, so with a threshold of 100.0 the
    # distance() test must fail and the hit is accepted for another reason.
    accepted, why = candidate.good_hit_as_tuple(100.0)
    assert accepted
    assert why == 'good_two_smallest_distances'

    # With the threshold at 200.0 the distance test itself passes.
    assert candidate.good_hit_as_tuple(200.0) == (True, 'dist_lt_200.0')

    candidate.function_dist = 22
    assert not candidate.is_good_hit()
def process_hit(db, all_controls, hit: BestControl, producer, reason, stats=None):
    """
    Publish a good hit to the 'etl-good-hits' topic and persist it in ``db.etl_hits``.

    :param db: database handle exposing an ``etl_hits`` collection
    :param all_controls: dict keyed by control URL; ``hit.control_url`` must be present
    :param hit: the hit to record (callers must only pass good hits)
    :param producer: object with a ``send(topic, value)`` method
    :param reason: non-empty string naming why the hit was accepted; used as the
                   key under which the hit is counted in ``stats``
    :param stats: dict of per-reason counters, updated in place (required,
                  despite the ``None`` default)
    :return: True when the hit was sent and inserted; False when the record was
             skipped because the origin or cited_on URL yielded no host
    """
    assert stats is not None
    assert hit.ast_dist >= 0.0
    assert len(reason) > 0

    origin_fields = as_url_fields(hit.origin_url, prefix='origin')
    if origin_fields.get('origin_host') is None:
        return False

    record = asdict(hit)
    # diff_functions is re-added below in list form
    record.pop('diff_functions', None)

    control_url = hit.control_url
    assert control_url in all_controls  # should have been checked before call
    assert len(control_url) > 0  # bad hits should not come to process_hit()

    record.update(origin_fields)
    # cited_on URL (aka. HTML page) iff specified
    record.update(as_url_fields(hit.cited_on, prefix='cited_on'))
    record['control_family'] = all_controls[control_url].get('family')

    # good hits get sent to the suspicious analysis pipeline
    # NB: the counter is bumped before the cited_on check, so skipped records are counted too
    stats[reason] = stats.get(reason, 0) + 1

    dc = record.copy()
    dc.pop('_id', None)
    dc['diff_functions'] = hit.diff_functions_as_list()
    assert isinstance(dc['diff_functions'], list)

    if dc['cited_on_host'] is None:  # some buggy records - rare
        print("Bad data - skipping... {}".format(dc))
        return False

    producer.send('etl-good-hits', dc)

    # sanity check: if different literals are found then the string must be greater than zero length
    assert hit.n_diff_literals <= 0 or len(dc.get('diff_literals')) > 0

    # BREAKING CHANGE: dc['diff_functions'] is now a list not a comma separated
    # string, but literals is still a string
    db.etl_hits.insert_one(dc)
    return True
# Counters: n = records consumed, n_not_good = records rejected.
# n_good is initialised here; it is not updated in the visible part of the loop.
n_good = n_not_good = n = 0
save_pidfile('pid.etl.hits')

# Index every control record by its 'origin' URL for lookup while processing hits.
all_controls = {}
for t in load_controls(db, verbose=args.v):
    assert isinstance(t, tuple)
    assert len(t) == 3
    assert isinstance(t[0], dict)
    assert 'origin' in t[0]
    all_controls[t[0].get('origin')] = t[0]

stats = {}
for r in consumer:
    n += 1
    d = r.value
    # BestControl is built from the remaining keys, so drop this one first
    d.pop('origin_vectors_sha256', None)
    hit = BestControl(**d)
    if args.v:
        if n % 10000 == 0:
            print("Processed {} records ({} not good). {}".format(
                n, n_not_good, str(datetime.utcnow())))
            print(stats)
    if n > args.n:
        # TODO FIXME: will lose last message unless we exit before kafka auto
        # commit occurs (but only if --n specified)
        break

    # 1. if xref is missing/empty reject the entire record
    if hit.xref is None or len(hit.xref) == 0:
        # was `n_bad += 1`, but no n_bad counter is initialised above;
        # n_not_good is the rejected-record counter the progress message reports
        n_not_good += 1
        continue

    # 2. bad AST*function call product (over threshold)? Or not a hit? reject entire record
def iterate(consumer, max, verbose, threshold):
    """
    Yield a BestControl for each record produced by ``next_artefact``.

    ``next_artefact`` is given ``consumer``, ``max``, ``verbose`` and a predicate
    that is true for records whose 'ast_dist' is greater than ``threshold``.
    """

    def exceeds_threshold(v):
        return v['ast_dist'] > threshold

    records = next_artefact(consumer, max, exceeds_threshold, verbose=verbose)
    for record in records:
        yield BestControl(**record)
def find_best_control(db, input_features, max_distance=200.0, debug=False, control_cache=None):
    """
    Search the feasible controls for the best match against ``input_features``.

    Candidate controls come from ``find_plausible_controls`` (given the AST sum,
    function-call sum and ``max_distance``) filtered by ``find_feasible_controls``.
    Each candidate is scored with ``distance()``; a candidate is only considered
    when its total distance is below the best seen so far and no greater than
    ``max_distance``. Literal distance is not used here: every BestControl built
    in this function gets ``literal_dist=0.0`` and the literal fields are
    expected to be updated elsewhere.

    :param db: database handle (must not be None)
    :param input_features: dict for the origin artefact. Keys read here are
        'url' (or legacy 'id'), 'origin', 'js_id', 'statements_by_count',
        'calls_by_count' and, when a near-zero distance is found, 'sha256'.
    :param max_distance: upper bound on the total distance of an acceptable control
    :param debug: print progress/rejection messages when true
    :param control_cache: passed through to ``find_feasible_controls``
    :return: tuple ``(best_control, second_best_control)``. ``best_control`` is a
        placeholder with an empty ``control_url`` and infinite AST/function
        distances when nothing was accepted. ``second_best_control`` may be
        None, and may be that same placeholder when only one control was accepted.
    """
    assert db is not None
    origin_url = input_features.get('url', input_features.get('id'))  # LEGACY: url field used to be named id field
    cited_on = input_features.get('origin', None)  # report owning HTML page also if possible (useful for data analysis)
    origin_js_id = input_features.get("js_id", None)  # ensure we can find the script directly without URL lookup
    if isinstance(origin_js_id, tuple) or isinstance(origin_js_id, list):
        # BUG FIXME: should not be a tuple but is... where is that coming from??? so...
        origin_js_id = origin_js_id[0]
    # NOTE(review): 24 characters presumably corresponds to a hex ObjectId string - confirm
    assert isinstance(origin_js_id, str) and len(origin_js_id) == 24

    best_distance = float('Inf')
    input_ast_vector, ast_sum = calculate_ast_vector(input_features['statements_by_count'])  # NB: UNweighted vector
    fcall_sum = sum(input_features['calls_by_count'].values())

    # Placeholder "no hit" result: empty control URL and infinite distances.
    best_control = BestControl(control_url='',
                               origin_url=origin_url,
                               cited_on=cited_on,
                               sha256_matched=False,
                               ast_dist=float('Inf'),
                               function_dist=float('Inf'),
                               literal_dist=0.0,
                               diff_functions='',
                               origin_js_id=origin_js_id)
    second_best_control = None

    # we open the distance to explore "near by" a little bit... but the scoring for these hits is unchanged
    if debug:
        print("find_best_control({})".format(origin_url))
    plausible_controls = find_plausible_controls(db, ast_sum, fcall_sum, max_distance=max_distance)
    feasible_controls = find_feasible_controls(db, plausible_controls, debug=debug, control_cache=control_cache)
    for fc_tuple in feasible_controls:
        control, control_ast_sum, control_ast_vector, control_call_vector = fc_tuple  # NB: unweighted ast vector
        assert isinstance(control, dict)
        assert control_ast_sum > 0
        assert isinstance(control_ast_vector, list)
        control_url = control.get('origin')

        # compute what we can for now and if we can update it later we will.
        # Otherwise the second_best control may have some fields not-computed
        new_distance, ast_dist, call_dist, diff_functions = distance(
            input_ast_vector, control_ast_vector, input_features['calls_by_count'], control_call_vector, debug=debug)
        # Diagnostic only: small call distance but total distance over the limit.
        if call_dist < 5.0 and new_distance > max_distance:
            print(
                "WARNING: rejecting possibly feasible control due to bad total distance: {} {} {} {} {}"
                .format(new_distance, ast_dist, call_dist, control_url, origin_url))

        if new_distance < best_distance and new_distance <= max_distance:
            if debug:
                print("Got good distance {} for {} (was {}, max={})".format(
                    new_distance, control_url, best_distance, max_distance))
            new_control = BestControl(
                control_url=control_url,  # control artefact from CDN (ground truth)
                origin_url=origin_url,  # JS at spidered site
                origin_js_id=origin_js_id,
                cited_on=cited_on,
                sha256_matched=False,
                ast_dist=ast_dist,
                function_dist=call_dist,
                literal_dist=0.0,
                diff_functions=' '.join(diff_functions))
            # NB: look at product of two distances before deciding to update best_* - hopefully this results in a lower false positive rate
            # (with accidental ast hits) as the number of controls in the database increases
            if best_control.is_better(new_control, max_distance=max_distance):
                # The current best stays; the new control can only become second best.
                second_dist = second_best_control.distance() if second_best_control is not None else 0.0
                if second_best_control is None or second_dist > new_control.distance():
                    if debug:
                        print(
                            "NOTE: improved second_best control was {} now is {}"
                            .format(second_best_control, new_control))
                    second_best_control = new_control
                # NB: dont update best_* since we dont consider this hit a replacement for current best_control
            else:
                # New control takes over; the previous best is demoted to second best.
                best_distance = new_distance
                second_best_control = best_control
                best_control = new_control
                if best_distance < 0.00001:  # small distance means we can try for a hash match against control?
                    assert control_url == best_control.control_url
                    hash_match = (control['sha256'] == input_features['sha256'])
                    best_control.sha256_matched = hash_match
                    break  # save time since we've likely found the best control but this may mean next_best_control is not second best in rare cases
        else:
            if debug:
                print(
                    "Rejecting control {} ast_dist={} fcall_dist={} total={}".
                    format(control['origin'], ast_dist, call_dist, new_distance))

    # NB: literal fields in best_control/next_best_control are updated elsewhere... not here
    return (best_control, second_best_control)
def test_process_hit_success():
    """process_hit() must send one record to 'etl-good-hits' and insert the same record into etl_hits."""
    db = mock.Mock()
    producer = mock.Mock()
    control_url = 'https://cdn.com/path/to/artefact.js'
    bc = BestControl(control_url=control_url,
                     origin_url='https://some.web.site/somewhere.js',
                     cited_on='https://some.web.site/somewhere.html',
                     origin_js_id='12345',
                     sha256_matched=True,
                     diff_functions='',
                     ast_dist=0.0,
                     literal_dist=0.0,
                     function_dist=0.0)
    all_controls = {
        control_url: {
            'control_url': control_url,
            'literals_by_count': {
                'a': 1
            }
        }
    }
    # Configuring a return value is not itself recorded in db.method_calls.
    db.analysis_content.find_one.return_value = {
        'analysis_bytes': b'{"calls_by_count":{}}'
    }

    process_hit(db, all_controls, bc, producer, 'distance_lt_200.0', stats={})

    # The same payload is expected on the producer and in the database, so it is
    # spelled out once (the earlier copy for the producer listed 'origin_js_id' twice).
    expected_record = {
        'control_url': 'https://cdn.com/path/to/artefact.js',
        'origin_url': 'https://some.web.site/somewhere.js',
        'sha256_matched': True,
        'ast_dist': 0.0,
        'function_dist': 0.0,
        'cited_on': 'https://some.web.site/somewhere.html',
        'literal_dist': 0.0,
        'xref': None,
        'literals_not_in_control': -1,
        'literals_not_in_origin': -1,
        'n_diff_literals': -1,
        'diff_literals': '',
        'origin_host': 'some.web.site',
        'origin_has_query': False,
        'origin_port': 443,
        'origin_js_id': '12345',
        'origin_scheme': 'https',
        'origin_path': '/somewhere.js',
        'cited_on_host': 'some.web.site',
        'cited_on_has_query': False,
        'cited_on_port': 443,
        'cited_on_scheme': 'https',
        'cited_on_path': '/somewhere.html',
        'control_family': None,
        'diff_functions': [],
    }

    assert len(db.method_calls) == 1
    assert db.method_calls == [mock.call.etl_hits.insert_one(expected_record)]
    assert producer.method_calls == [
        mock.call.send('etl-good-hits', expected_record)
    ]
def test_best_control():
    """Exercise BestControl construction, defaults, ordering and good-hit classification."""
    bc1 = BestControl(origin_url='XXX',
                      control_url='YYY',
                      literal_dist=0.0,
                      ast_dist=1.0,
                      function_dist=1.5,
                      sha256_matched=False,
                      diff_functions='parseJSON runAJAX')
    assert bc1.origin_url == 'XXX'
    assert bc1.control_url == 'YYY'
    assert bc1.ast_dist == pytest.approx(1.0)
    assert bc1.function_dist == pytest.approx(1.5)
    assert not bc1.sha256_matched
    assert bc1.diff_functions == 'parseJSON runAJAX'
    assert bc1.origin_js_id is None
    assert bc1.cited_on is None
    # approx() must be compared with ==; used bare in an assert it checks nothing
    # (and raises under pytest >= 7). literal_dist was constructed as 0.0.
    assert bc1.literal_dist == pytest.approx(0.0)
    assert bc1.is_good_hit()

    bc2 = BestControl(origin_url='AAA',
                      control_url='BBB',
                      ast_dist=10.0,
                      function_dist=1.5,
                      literal_dist=0.0,
                      sha256_matched=False,
                      diff_functions='a b c')
    assert bc1.is_better(bc2)
    assert not bc2.is_better(bc1)
    assert bc1.distance() < bc2.distance()
    # NOTE(review): this test previously also carried a bare
    # `pytest.approx(bc1.distance(), 1.5)` which never compared anything; pin the
    # exact expected value of bc1.distance() once the formula is confirmed.

    bc3 = BestControl(origin_url='AAA',
                      control_url='BBB',
                      ast_dist=43.0,
                      function_dist=3.0,
                      literal_dist=9.7,
                      sha256_matched=False,
                      diff_functions='aaa bbb')
    assert not bc3.is_good_hit()

    # A literal distance of -1 must not be reported as a bad literal distance.
    bc2.literal_dist = -1
    assert bc2.good_hit_as_tuple(200.0) != (False, 'bad_literal_dist')