Esempio n. 1
0
def update_literal_distance(db,
                            hit: BestControl,
                            ovec,
                            fail_if_difference=False):
    """
    Fill in the literal-comparison fields of ``hit`` in place.

    The control's literal vector is looked up via lookup_control_literals()
    using ``hit.control_url`` and compared against ``ovec`` (the origin's
    literal vector) with calculate_literal_distance(). Both vectors are passed
    through truncate_literals() first.

    Parameters:
        db: database handle, passed straight to lookup_control_literals().
        hit: the BestControl record to update; must not be None.
        ovec: literal vector of the origin artefact; must not be None.
        fail_if_difference: forwarded to calculate_literal_distance() and also
            used as the ``debug`` flag of lookup_control_literals().

    Returns:
        None. On return ``hit`` has literal_dist, literals_not_in_origin,
        literals_not_in_control, n_diff_literals and diff_literals set. When
        the hit has no control URL, or no literal vector is found for it, these
        are set to the sentinels -1.0 / -1 / -1 / -1 / ''.

    Raises:
        Whatever the helper functions raise (e.g. ValueError) propagates
        unchanged to the caller.
    """
    assert hit is not None
    assert ovec is not None

    # An empty (or missing) control URL means there is nothing to look up.
    control_literal_vector = None
    if hit.control_url:
        control_literal_vector = lookup_control_literals(
            db, hit.control_url, debug=fail_if_difference)

    if control_literal_vector is None:
        hit.literal_dist = -1.0
        hit.literals_not_in_origin = -1
        hit.literals_not_in_control = -1
        hit.n_diff_literals = -1
        hit.diff_literals = ''
        return

    v1 = truncate_literals(control_literal_vector)
    v2 = truncate_literals(ovec)
    # No try/except here: the previous handler only re-raised the ValueError.
    (hit.literal_dist, hit.literals_not_in_origin,
     hit.literals_not_in_control, diff_literals) = calculate_literal_distance(
         v1, v2, fail_if_difference=fail_if_difference)
    hit.n_diff_literals = len(diff_literals)
    hit.diff_literals = fix_literals(diff_literals)
def test_best_control_infinity_handling():
    """A hit with all-zero distances must rank above one whose distances are all infinite."""
    infinite = float('Inf')
    unmatched = BestControl(control_url='',
                            origin_url='',
                            sha256_matched=False,
                            diff_functions='',
                            literal_dist=infinite,
                            ast_dist=infinite,
                            function_dist=infinite)
    exact = BestControl(control_url='XXX',
                        origin_url='YYY',
                        sha256_matched=True,
                        diff_functions='',
                        literal_dist=0.0,
                        ast_dist=0.0,
                        function_dist=0.0)

    # The comparison must hold in both directions.
    assert exact.is_better(unmatched)
    assert not unmatched.is_better(exact)
def test_best_control_max_distance():
    """Check how the max-distance argument changes the reason reported by good_hit_as_tuple()."""
    hit = BestControl(
        control_url='XXX',
        origin_url='YYY',
        sha256_matched=True,
        literal_dist=7.0,
        ast_dist=12.0,
        function_dist=3.0,
        # original author's note: need 12 to disable original criteria
        diff_functions='                  ')

    # original author's note: distprod is (12+3) * (3+7) == 150, so with a
    # limit of 100.0 the distance() test must fail and another criterion applies
    accepted, why = hit.good_hit_as_tuple(100.0)
    assert accepted and why == 'good_two_smallest_distances'

    # With a limit of 200.0 the distance criterion is the one reported.
    assert hit.good_hit_as_tuple(200.0) == (True, 'dist_lt_200.0')

    hit.function_dist = 22
    assert not hit.is_good_hit()
Esempio n. 4
0
def process_hit(db,
                all_controls,
                hit: BestControl,
                producer,
                reason,
                stats=None):
    """
    Publish a good hit to the 'etl-good-hits' topic and store it in db.etl_hits.

    The record is built from the dataclass fields of ``hit``, the URL fields
    of its origin and cited_on URLs (via as_url_fields()) and the 'family' of
    the matched control. 'diff_functions' is emitted as a list (from
    hit.diff_functions_as_list()); 'diff_literals' remains a string.

    Parameters:
        db: database handle; only db.etl_hits.insert_one() is used here.
        all_controls: mapping of control URL -> control record; must contain
            hit.control_url (callers are expected to have checked this).
        hit: the BestControl to publish.
        producer: object with a send(topic, value) method.
        reason: non-empty string naming why the hit was accepted; used as the
            key of the counter incremented in ``stats``.
        stats: dict of per-reason counters, updated in place. Required
            despite the None default.

    Returns:
        False when the record is skipped (origin URL has no host, or the
        cited_on URL has no host). None after the record has been published
        and inserted.
    """
    assert stats is not None
    assert hit.ast_dist >= 0.0
    assert len(reason) > 0

    origin_fields = as_url_fields(hit.origin_url, prefix='origin')
    if origin_fields.get('origin_host') is None:
        return False

    record = asdict(hit)
    # Removed here and re-added last as a list, which keeps the key order of
    # the stored document the same as before.
    record.pop('diff_functions', None)

    control_url = hit.control_url
    assert control_url in all_controls  # should have been checked before call
    assert len(control_url) > 0  # bad hits should not come to process_hit()
    record.update(origin_fields)

    # cited_on URL (aka. HTML page) iff specified
    record.update(as_url_fields(hit.cited_on, prefix='cited_on'))
    record['control_family'] = all_controls[control_url].get('family')

    # good hits get sent to the suspicious analysis pipeline
    stats[reason] = stats.get(reason, 0) + 1
    record.pop('_id', None)
    record['diff_functions'] = hit.diff_functions_as_list()
    assert isinstance(record['diff_functions'], list)

    # .get() so that a record without any cited_on_host key is skipped like
    # one whose host is None, instead of raising KeyError.
    if record.get('cited_on_host') is None:  # some buggy records - rare
        print("Bad data - skipping... {}".format(record))
        return False

    # sanity check: if different literals are found then the string must be
    # non-empty. Checked before anything is sent so that a failure cannot
    # leave a message published but not persisted.
    if hit.n_diff_literals > 0:
        assert record.get('diff_literals')

    producer.send('etl-good-hits', record)
    # BREAKING CHANGE: record['diff_functions'] is now a list not a comma
    # separated string, but literals is still a string
    db.etl_hits.insert_one(record)
Esempio n. 5
0
    # Counters: n = records consumed; n_not_good is reported in the progress line below.
    n_good = n_not_good = n = 0
    save_pidfile('pid.etl.hits')
    # Index every control record by its 'origin' value for lookup by control URL.
    all_controls = {}
    for t in load_controls(db, verbose=args.v):
        # load_controls() is expected to yield 3-tuples whose first element is the control dict
        assert isinstance(t, tuple)
        assert len(t) == 3
        assert isinstance(t[0], dict)
        assert 'origin' in t[0]
        all_controls[t[0].get('origin')] = t[0]

    # per-reason counters, printed with the progress line in verbose mode
    stats = {}
    for r in consumer:
        n += 1
        d = r.value
        # this key is dropped before the remaining keys are passed to BestControl(**d)
        d.pop('origin_vectors_sha256', None)
        hit = BestControl(**d)

        if args.v:
            # progress report every 10000 records
            if n % 10000 == 0:
                print("Processed {} records ({} not good). {}".format(
                    n, n_not_good, str(datetime.utcnow())))
                print(stats)
        if n > args.n:  # TODO FIXME: will lose last message unless we exit before kafka auto commit occurs (but only if --n specified)
            break

        # 1. if xref is missing/empty reject the entire record
        if hit.xref is None or len(hit.xref) == 0:
            # NOTE(review): n_bad is not initialised anywhere in the visible code
            # (only n_good, n_not_good and n are) - confirm it is defined earlier,
            # or whether n_not_good was intended here.
            n_bad += 1
            continue

        # 2. bad AST*function call product (over threshold)? Or not a hit? reject entire record
def iterate(consumer, max, verbose, threshold):
    """
    Lazily yield a BestControl for each record produced by next_artefact().

    ``consumer``, ``max`` and ``verbose`` are handed to next_artefact()
    unchanged, together with a predicate that is true for records whose
    'ast_dist' exceeds ``threshold``. How next_artefact() uses that predicate
    is not visible here - presumably to skip such records; verify against its
    definition.
    """

    def exceeds_threshold(record):
        return record['ast_dist'] > threshold

    records = next_artefact(consumer,
                            max,
                            exceeds_threshold,
                            verbose=verbose)
    for fields in records:
        yield BestControl(**fields)
Esempio n. 7
0
def find_best_control(db,
                      input_features,
                      max_distance=200.0,
                      debug=False,
                      control_cache=None):
    """
    Find the control artefact that best matches the script described by input_features.

    Candidate controls come from find_plausible_controls() (given the input's AST sum,
    function-call sum and max_distance) followed by find_feasible_controls(). Each candidate
    is scored with distance(); only candidates whose total distance is <= max_distance and
    lower than the best total seen so far are considered further. For those,
    BestControl.is_better() decides whether the candidate replaces the current best hit or
    only competes for second place. Literal distance is not used for the calculation
    (literal_dist is set to 0.0 here). Could be improved....

    input_features must provide 'statements_by_count', 'calls_by_count' and a 24-character
    string 'js_id' (or a tuple/list whose first element is one); 'sha256' is read when a
    near-zero distance is found. 'url' (or legacy 'id') and 'origin' are optional.

    Returns a tuple (best_control, second_best_control). best_control is a placeholder with
    an empty control_url and infinite AST/function distances when no candidate qualified.
    second_best_control may be None, and can also be that initial placeholder since it is
    assigned from the previous best_control.
    """
    assert db is not None
    origin_url = input_features.get('url', input_features.get(
        'id'))  # LEGACY: url field used to be named id field
    cited_on = input_features.get(
        'origin', None
    )  # report owning HTML page also if possible (useful for data analysis)
    origin_js_id = input_features.get(
        "js_id",
        None)  # ensure we can find the script directly without URL lookup
    if isinstance(origin_js_id, tuple) or isinstance(
            origin_js_id, list
    ):  # BUG FIXME: should not be a tuple but is... where is that coming from??? so...
        origin_js_id = origin_js_id[0]
    assert isinstance(origin_js_id, str) and len(origin_js_id) == 24

    best_distance = float('Inf')
    input_ast_vector, ast_sum = calculate_ast_vector(
        input_features['statements_by_count'])  # NB: UNweighted vector
    fcall_sum = sum(input_features['calls_by_count'].values())
    # placeholder "no hit" result: empty control_url and infinite distances
    best_control = BestControl(control_url='',
                               origin_url=origin_url,
                               cited_on=cited_on,
                               sha256_matched=False,
                               ast_dist=float('Inf'),
                               function_dist=float('Inf'),
                               literal_dist=0.0,
                               diff_functions='',
                               origin_js_id=origin_js_id)
    second_best_control = None

    # we open the distance to explore "near by" a little bit... but the scoring for these hits is unchanged
    if debug:
        print("find_best_control({})".format(origin_url))
    plausible_controls = find_plausible_controls(db,
                                                 ast_sum,
                                                 fcall_sum,
                                                 max_distance=max_distance)
    feasible_controls = find_feasible_controls(db,
                                               plausible_controls,
                                               debug=debug,
                                               control_cache=control_cache)
    for fc_tuple in feasible_controls:
        control, control_ast_sum, control_ast_vector, control_call_vector = fc_tuple  # NB: unweighted ast vector
        assert isinstance(control, dict)
        assert control_ast_sum > 0
        assert isinstance(control_ast_vector, list)
        control_url = control.get('origin')

        # compute what we can for now and if we can update it later we will. Otherwise the second_best control may have some fields not-computed
        new_distance, ast_dist, call_dist, diff_functions = distance(
            input_ast_vector,
            control_ast_vector,
            input_features['calls_by_count'],
            control_call_vector,
            debug=debug)
        # diagnostic only: a low call distance but a total over the limit is reported, not acted on
        if call_dist < 5.0 and new_distance > max_distance:
            print(
                "WARNING: rejecting possibly feasible control due to bad total distance: {} {} {} {} {}"
                .format(new_distance, ast_dist, call_dist, control_url,
                        origin_url))
        if new_distance < best_distance and new_distance <= max_distance:
            if debug:
                print("Got good distance {} for {} (was {}, max={})".format(
                    new_distance, control_url, best_distance, max_distance))
            new_control = BestControl(
                control_url=
                control_url,  # control artefact from CDN (ground truth)
                origin_url=origin_url,  # JS at spidered site
                origin_js_id=origin_js_id,
                cited_on=cited_on,
                sha256_matched=False,
                ast_dist=ast_dist,
                function_dist=call_dist,
                literal_dist=0.0,
                diff_functions=' '.join(diff_functions))

            # NB: look at product of two distances before deciding to update best_* - hopefully this results in a lower false positive rate
            #     (with accidental ast hits) as the number of controls in the database increases
            if best_control.is_better(new_control, max_distance=max_distance):
                # current best stays; the candidate may still become the second best
                second_dist = second_best_control.distance(
                ) if second_best_control is not None else 0.0
                if second_best_control is None or second_dist > new_control.distance(
                ):
                    if debug:
                        print(
                            "NOTE: improved second_best control was {} now is {}"
                            .format(second_best_control, new_control))
                    second_best_control = new_control
                # NB: dont update best_* since we dont consider this hit a replacement for current best_control
            else:
                # candidate becomes the best hit; the previous best is demoted to second best
                best_distance = new_distance
                second_best_control = best_control
                best_control = new_control

                if best_distance < 0.00001:  # small distance means we can try for a hash match against control?
                    assert control_url == best_control.control_url
                    hash_match = (
                        control['sha256'] == input_features['sha256'])
                    best_control.sha256_matched = hash_match
                    break  # save time since we've likely found the best control but this may mean next_best_control is not second best in rare cases
        else:
            # candidate is over max_distance or no closer than the best total distance so far
            if debug:
                print(
                    "Rejecting control {} ast_dist={} fcall_dist={} total={}".
                    format(control['origin'], ast_dist, call_dist,
                           new_distance))
    # NB: literal fields in best_control/next_best_control are updated elsewhere... not here
    return (best_control, second_best_control)
def test_process_hit_success():
    """
    process_hit() must publish and persist exactly one record for a good hit.

    The same document is expected both as the payload sent to the
    'etl-good-hits' topic and as the argument to db.etl_hits.insert_one(), so
    it is defined once and checked against both mocks.
    """
    db = mock.Mock()
    producer = mock.Mock()
    control_url = 'https://cdn.com/path/to/artefact.js'
    bc = BestControl(control_url=control_url,
                     origin_url='https://some.web.site/somewhere.js',
                     cited_on='https://some.web.site/somewhere.html',
                     origin_js_id='12345',
                     sha256_matched=True,
                     diff_functions='',
                     ast_dist=0.0,
                     literal_dist=0.0,
                     function_dist=0.0)
    all_controls = {
        control_url: {
            'control_url': control_url,
            'literals_by_count': {
                'a': 1
            }
        }
    }
    # Stubbed lookup result; the method_calls assertion below confirms that
    # process_hit() makes no db call other than the insert.
    db.analysis_content.find_one.return_value = {
        'analysis_bytes': b'{"calls_by_count":{}}'
    }

    process_hit(db, all_controls, bc, producer, 'distance_lt_200.0', stats={})

    expected_record = {
        'control_url': 'https://cdn.com/path/to/artefact.js',
        'origin_url': 'https://some.web.site/somewhere.js',
        'sha256_matched': True,
        'ast_dist': 0.0,
        'function_dist': 0.0,
        'cited_on': 'https://some.web.site/somewhere.html',
        'literal_dist': 0.0,
        'xref': None,
        'literals_not_in_control': -1,
        'literals_not_in_origin': -1,
        'n_diff_literals': -1,
        'diff_literals': '',
        'origin_js_id': '12345',
        'origin_host': 'some.web.site',
        'origin_has_query': False,
        'origin_port': 443,
        'origin_scheme': 'https',
        'origin_path': '/somewhere.js',
        'cited_on_host': 'some.web.site',
        'cited_on_has_query': False,
        'cited_on_port': 443,
        'cited_on_scheme': 'https',
        'cited_on_path': '/somewhere.html',
        'control_family': None,
        'diff_functions': [],
    }
    expected_insert_call = mock.call.etl_hits.insert_one(expected_record)
    assert len(db.method_calls) == 1
    assert db.method_calls == [expected_insert_call]
    assert producer.method_calls == [
        mock.call.send('etl-good-hits', expected_record)
    ]
def test_best_control():
    """
    Exercise BestControl construction, defaults, ranking and hit classification.

    pytest.approx() only checks anything when used on one side of ``==``; a
    bare ``assert pytest.approx(a, b)`` compares nothing (and recent pytest
    versions reject approx objects in a boolean context), so the approximate
    checks below are written as explicit comparisons.
    """
    bc1 = BestControl(origin_url='XXX', control_url='YYY', literal_dist=0.0, ast_dist=1.0, function_dist=1.5, sha256_matched=False, diff_functions='parseJSON runAJAX')
    assert bc1.origin_url == 'XXX'
    assert bc1.control_url == 'YYY'
    assert bc1.ast_dist == pytest.approx(1.0)
    assert bc1.function_dist == pytest.approx(1.5)
    assert not bc1.sha256_matched
    assert bc1.diff_functions == 'parseJSON runAJAX'
    # fields not passed to the constructor default to None
    assert bc1.origin_js_id is None
    assert bc1.cited_on is None
    # literal_dist was passed explicitly as 0.0 above
    assert bc1.literal_dist == pytest.approx(0.0)
    assert bc1.is_good_hit()

    bc2 = BestControl(origin_url='AAA', control_url='BBB', ast_dist=10.0, function_dist=1.5, literal_dist=0.0, sha256_matched=False, diff_functions='a b c')
    assert bc1.is_better(bc2)
    assert not bc2.is_better(bc1)
    # NOTE(review): the previous version also had a no-op pytest.approx(bc1.distance(), 1.5)
    # operand here; pin the exact expected distance once it is confirmed.
    assert bc1.distance() < bc2.distance()

    bc3 = BestControl(origin_url='AAA', control_url='BBB', ast_dist=43.0, function_dist=3.0, literal_dist=9.7, sha256_matched=False, diff_functions='aaa bbb')
    assert not bc3.is_good_hit()

    bc2.literal_dist = -1
    # NOTE(review): this asserts the result is NOT (False, 'bad_literal_dist'), which is
    # what the original `not ... == ...` evaluated to - confirm that is the intent.
    assert bc2.good_hit_as_tuple(200.0) != (False, 'bad_literal_dist')