Beispiel #1
0
def validate_feature_vectors(db, msg, java, extractor):
    """
    Re-derive the feature vectors of the artefact described by msg and compare them with msg.

    The script body is loaded from db.scripts (looked up by msg['js_id']) and run through
    analyse_script() again. Per the original notes this is slow and expensive, and is only
    meant to be used when --defensive is specified.

    Returns False if the re-analysis reports failure, True if every comparison holds.
    Raises ValueError if the script document cannot be found, and AssertionError if a
    precondition fails or a recomputed vector disagrees with the one carried by msg.
    """
    assert isinstance(msg, dict)
    assert 'url' in msg
    assert os.path.exists(java)
    assert os.path.exists(extractor)

    artefact = JavascriptArtefact(url=msg['url'],
                                  sha256=msg['sha256'],
                                  md5=msg['md5'],
                                  size_bytes=msg['size_bytes'])
    script_doc = db.scripts.find_one({'_id': ObjectId(msg['js_id'])})
    if script_doc is None:
        raise ValueError("Could not locate script: {}".format(artefact.url))

    recomputed, failed, _stderr = analyse_script(script_doc.get('code'),
                                                 artefact,
                                                 java=java,
                                                 feature_extractor=extractor)
    if failed:
        return False

    # literals are only compared by how many distinct entries there are
    n_recomputed = len(recomputed['literals_by_count'])
    n_reported = len(msg['literals_by_count'])
    print(n_recomputed, " ", n_reported)
    assert n_reported == n_recomputed

    # AST statement and function-call vectors must match exactly
    for vector_name in ('statements_by_count', 'calls_by_count'):
        assert recomputed[vector_name] == msg[vector_name]
    return True
Beispiel #2
0
def iterate(consumer, max, cache, verbose=False):
    """
    Generator over the JavascriptArtefact's read from consumer which have not been seen before.

    Messages come from next_artefact() in batches of (up to) 2000 and each batch is visited in
    sha256 order, which the original notes say maximises cache performance. For each message:
      * sha256 not in cache: the (md5, size_bytes) pair is recorded under it and the artefact is yielded
      * sha256 in cache with the same md5 and size_bytes: counted as cached and not yielded
      * sha256 in cache but md5 and/or size_bytes differ: yielded (cache entry is left untouched)

    Once a batch has been fully consumed a summary line is printed and consumer.commit() is
    called, unless consumer is a plain list (which is what tests pass in).
    """
    assert consumer is not None
    assert cache is not None
    assert max > 0

    batch_size = 2000
    artefact_stream = next_artefact(consumer,
                                    max,
                                    javascript_only(),
                                    verbose=verbose)
    for messages in batch(artefact_stream, n=batch_size):
        n_messages = len(messages)
        if n_messages < batch_size:
            print("WARNING: expected {} messages, got {}".format(
                batch_size, n_messages))

        n_cached = 0
        for fields in sorted(messages, key=lambda rec: rec['sha256']):
            jsr = JavascriptArtefact(**fields)
            if verbose:
                print(jsr)

            digest = jsr.sha256
            if digest in cache:
                if cache[digest][0] == jsr.md5 and cache[digest][
                        1] == jsr.size_bytes:
                    n_cached += 1
                    continue
                # same sha256 but md5 and/or size_bytes is unexpected: fall through and yield
            else:
                cache[digest] = (jsr.md5, jsr.size_bytes)
            yield jsr

        print("Processed message batch: n={} cached={}".format(
            n_messages, n_cached))
        # HACK: lists stand in for the consumer under test and have no commit()
        if not isinstance(consumer, list):
            # blocking commit at the end of the batch: may cause duplicates, which is preferred to loss
            consumer.commit()
def test_analyse_script_utf8_handling(pytestconfig,
                                      analyse_utf8_expected_results):
    """
    Analyse src/test-javascript/ca.js and compare its literals with the expected-results fixture.

    The truncated literal vector must equal the fixture's, and calculate_literal_distance()
    between the two must report a zero distance with nothing missing on either side.
    """
    testjs = "{}/src/test-javascript/ca.js".format(pytestconfig.rootdir)
    with open(testjs, 'rb') as fp:
        jsr = JavascriptArtefact(url="file:{}".format(testjs),
                                 origin=None,
                                 sha256="XXX",
                                 md5="XXX")
        byte_content, failed, stderr = analyse_script(
            fp.read(),
            jsr,
            feature_extractor="{}/src/extract-features.jar".format(
                pytestconfig.rootdir))
    assert not failed
    js = json.loads(byte_content.decode())

    # must match results computed by java on the CLI...
    v1 = truncate_literals(js['literals_by_count'])
    v2 = truncate_literals(analyse_utf8_expected_results['literals_by_count'])
    assert len(v1) == len(v2)
    assert v1 == v2

    # and that calculate_literal_distance() is also zero (ie. unicode handling is robust)
    dist, n_not_in_origin, n_not_in_control, diff_lits = calculate_literal_distance(
        v1, v2, fail_if_difference=True)
    # approx() must be the right-hand side of ==: `assert pytest.approx(dist, 0.0)` passed 0.0
    # as the relative tolerance and never compared dist with anything
    assert dist == pytest.approx(0.0)
    assert n_not_in_origin == 0
    assert n_not_in_control == 0
    assert diff_lits == []
Beispiel #4
0
def test_report_failure():
    """report_failure() must make exactly one send() call carrying the expected failure record."""
    failed_artefact = JavascriptArtefact(url='XXX',
                                         content_type='text/javascript',
                                         size_bytes=33,
                                         sha256='affb',
                                         md5='affc')
    producer = mock.Mock()
    report_failure(producer, failed_artefact, 'reason=hello world')

    # exactly one call, and it must be producer.send(<topic>, <record>)
    calls = producer.method_calls
    assert len(calls) == 1
    name, args, kwargs = calls[0]
    assert name == 'send'
    assert len(args) == 2
    topic, payload = args
    assert topic == 'feature-extraction-failures'

    # 'when' is a timestamp string, so it is only sanity-checked and then removed
    record = payload.copy()
    assert 'when' in record
    when = record.pop('when', None)
    assert isinstance(when, str) and len(when) > 10  # FIXME... stronger test... ?

    assert record == {
        'url': 'XXX',
        'sha256': 'affb',
        'md5': 'affc',
        'inline': False,
        'content_type': 'text/javascript',
        'size_bytes': 33,
        'origin': None,
        'reason': 'reason=hello world',
        'js_id': ''
    }
def test_analyse_script_2(pytestconfig):
    """
    Analyse fieldRequiredWhenNotAfterGoLiveValidation.js and check all three vectors
    (AST statements, function calls, literals) against hand-recorded counts.
    """
    expected_statements = {
        "FunctionNode": 2,
        "StringLiteral": 13,
        "VariableInitializer": 3,
        "Scope": 1,
        "KeywordLiteral": 3,
        "AstRoot": 1,
        "Assignment": 2,
        "IfStatement": 1,
        "Block": 2,
        "InfixExpression": 10,
        "ExpressionStatement": 4,
        "PropertyGet": 14,
        "ReturnStatement": 2,
        "UnaryExpression": 1,
        "Name": 37,
        "NumberLiteral": 2,
        "ArrayLiteral": 1,
        "VariableDeclaration": 3,
        "FunctionCall": 9,
        "ElementGet": 2,
        "ParenthesizedExpression": 3
    }
    expected_calls = {
        "val": 1,
        "F$": 3,
        "addMethod": 1,
        "get": 1,
        "attr": 1,
        "split": 1,
        "add": 1
    }
    expected_literals = {
        " ": 1,
        "0": 2,
        "#IsAfterGoLive": 1,
        "INPUT": 1,
        "requiredwhennotaftergolivevalidation": 1,
        "True": 1,
        "class": 1,
        "testrequiredwhennotaftergolivevalidation": 3,
        "SELECT": 1
    }

    root = pytestconfig.rootdir
    testjs = "{}/src/test-javascript/fieldRequiredWhenNotAfterGoLiveValidation.js".format(
        root)
    with open(testjs, "rb") as fp:
        content = fp.read()
    artefact = JavascriptArtefact(url="file:{}".format(testjs),
                                  origin=None,
                                  sha256="XXX",
                                  md5="XXX")
    byte_content, failed, stderr = analyse_script(
        content,
        artefact,
        feature_extractor="{}/src/extract-features.jar".format(root))
    assert not failed

    vectors = json.loads(byte_content.decode())
    assert vectors['statements_by_count'] == expected_statements
    assert vectors['calls_by_count'] == expected_calls
    assert vectors['literals_by_count'] == expected_literals
def calc_vector(filename):
    """
    Run analyse_script() over the file at filename and return the decoded JSON result.

    The feature extractor is taken from the module-level args.extractor. Raises Exception
    carrying the analyser's stderr when the analysis reports failure.
    """
    with open(filename, 'rb') as fp:
        content = fp.read()

    # hashes are placeholders: only the url is filled in with something meaningful here
    artefact = JavascriptArtefact(url="file:{}".format(filename),
                                  sha256="XXX",
                                  md5="XXX")
    raw_vectors, failed, stderr = analyse_script(
        content, artefact, feature_extractor=args.extractor)
    if failed:
        raise Exception(stderr)
    return json.loads(raw_vectors.decode('utf-8'))
Beispiel #7
0
def find_or_update_analysis_content(db,
                                    m,
                                    fail_iff_not_found=False,
                                    defensive=False,
                                    java=None,
                                    extractor=None,
                                    force=False):
    """
    Return the decoded analysis (feature vector) JSON for the artefact described by m.

    m must be a dict with 'js_id', 'url', 'sha256', 'md5' and 'size_bytes'. The analysis is
    read from db.analysis_content (keyed by js_id). If it is missing, or force is True, the
    script is fetched with get_script(), re-analysed with analyse_script(), stored with
    save_analysis_content() and then read back.

    Parameters:
      fail_iff_not_found  raise ValueError instead of re-analysing when no stored analysis exists
      defensive           verify that sha256 of the script content equals m['sha256'] before
                          anything is analysed or stored
      java, extractor     passed through to analyse_script()
      force               ignore any stored analysis and recompute it

    Raises ValueError if the analysis is absent (with fail_iff_not_found) or the script cannot
    be analysed, and AssertionError if a precondition or the defensive hash check fails.
    """
    assert isinstance(m, dict)
    assert all(key in m
               for key in ('js_id', 'url', 'sha256', 'md5', 'size_bytes'))

    js_id = m.get('js_id')
    assert len(js_id) > 0

    # NB: due to an error in processing, the db.analysis_content collection had to be thrown
    # away (8th June 2020), so records may be missing
    byte_content_doc = None
    if not force:
        byte_content_doc = db.analysis_content.find_one({'js_id': js_id})
        if byte_content_doc is None:
            if fail_iff_not_found:  # prevent infinite recursion
                raise ValueError("No data for {}".format(js_id))
            print("WARNING: unable to locate analysis content for {}".format(
                js_id))

    if byte_content_doc is None:
        code_bytes, js_id = get_script(db, js_id)
        assert code_bytes is not None
        if defensive:
            # check that artefact hashes match the actual content *before* analysing and
            # saving, so that a vector for mismatched content is never persisted
            assert hashlib.sha256(code_bytes).hexdigest() == m.get('sha256')

        jsr = JavascriptArtefact(url=m.get('url'),
                                 sha256=m.get('sha256'),
                                 md5=m.get('md5'),
                                 size_bytes=m.get('size_bytes'),
                                 js_id=js_id)
        vector_as_bytes, failed, stderr = analyse_script(
            code_bytes, jsr, java=java, feature_extractor=extractor)
        if failed:
            raise ValueError("Could not analyse artefact: js_id={}\n{}".format(
                js_id, stderr))
        save_analysis_content(db, jsr, vector_as_bytes)

        # read back what was just stored: this time it must be found
        return find_or_update_analysis_content(db, m, fail_iff_not_found=True)

    assert 'analysis_bytes' in byte_content_doc
    byte_content = byte_content_doc.get('analysis_bytes')
    assert isinstance(byte_content, bytes)
    return json.loads(byte_content.decode())
def test_javascript_artefact():
    """Exercise JavascriptArtefact construction, its defaults, the 'checksum' alias and ordering."""
    # usual constructor parameters for most apps
    typical = JavascriptArtefact(sha256="XYZ",
                                 md5="XXX",
                                 size_bytes=279,
                                 url="http://foo/bar/baz")
    assert typical.sha256 == "XYZ"
    assert typical.md5 == "XXX"
    assert typical.size_bytes == 279
    assert typical.url == "http://foo/bar/baz"
    # fields which were not supplied take these values
    assert typical.origin is None
    assert typical.inline == False
    assert typical.content_type == "text/javascript"
    assert isinstance(typical.when, str) and len(typical.when) > 10

    # but etl_upload requires compatibility with 'checksum' instead of md5
    via_checksum = JavascriptArtefact(sha256="JJJ",
                                      checksum="XXX",
                                      size_bytes=122999,
                                      url=None,
                                      origin="http://bar/baz")
    assert via_checksum.md5 == "XXX"
    assert via_checksum.size_bytes == 122999
    assert via_checksum.origin == "http://bar/baz"

    # and that sorting is correctly done via the md5 field
    j3 = JavascriptArtefact(md5='333', url=None, sha256='')
    j9 = JavascriptArtefact(md5='999', url=None, sha256='')
    j1 = JavascriptArtefact(md5='111', url=None, sha256='')
    assert sorted([j3, j9, j1]) == [j1, j3, j9]
def test_analyse_script_failure(pytestconfig):
    """
    Check the failure path of analyse_script(): google-analytics.js must be reported as
    failed, with the parser's complaint present in stderr.
    """
    # mozilla rhino cant handle all JS... so check that failure path is as expected
    root = pytestconfig.rootdir
    testjs = "{}/src/test-javascript/google-analytics.js".format(root)
    with open(testjs, "rb") as fp:
        content = fp.read()
    artefact = JavascriptArtefact(url="file:{}".format(testjs),
                                  origin=None,
                                  sha256="XXX",
                                  md5="XXX")
    _output, failed, stderr = analyse_script(
        content,
        artefact,
        feature_extractor="{}/src/extract-features.jar".format(root))
    assert failed
    assert "missing ; after for-loop initializer" in stderr
def test_analyse_script(pytestconfig, analyse_script_expected_results):
    """Analyse banners.js and compare the decoded result with the expected-results fixture."""
    root = pytestconfig.rootdir
    testjs = "{}/src/test-javascript/banners.js".format(root)
    with open(testjs, "rb") as fp:
        content = fp.read()
    artefact = JavascriptArtefact(url="file:{}".format(testjs),
                                  origin=None,
                                  sha256='XXX',
                                  md5='XXX')
    byte_content, failed, stderr = analyse_script(
        content,
        artefact,
        feature_extractor="{}/src/extract-features.jar".format(root))
    assert not failed

    result = json.loads(byte_content.decode())
    # the 'id' field is obsolete and is excluded from the comparison
    result.pop('id', None)
    assert result == analyse_script_expected_results
Beispiel #11
0
def resolve_feature_vector(db, message):
    """
    Build a dict describing the script behind message['id'] (a URL) together with its AST vector.

    The result always holds 'first_url', 'sha256', 'md5' and 'size_bytes' (the last three from
    the db.scripts document). The vector is merged in from db.statements_by_count when a
    document for the URL exists (minus its '_id' and 'url'); otherwise the script code is
    analysed with analyse_script() and its 'statements_by_count' is merged in instead.

    Raises Exception when the URL, its script_url link or the script cannot be found, or when
    the analysis fails (the message is then the analyser's stderr).
    """
    url = message.get('id')
    result = {'first_url': url}

    # 1. find url _id (whichever record for the url is returned will do, for now)
    url_doc = db.urls.find_one({'url': url})
    if not url_doc:
        raise Exception("Could not find URL in MongoDB: {}".format(url))
    url_id = url_doc.get('_id')

    # 2. follow the url to its script via db.script_url
    link_doc = db.script_url.find_one({'url_id': url_id})
    if not link_doc:
        raise Exception("Could not find script_url in MongoDB: {} {}".format(
            url_id, url))
    script_id = link_doc.get('script')

    # 3. load the script itself
    script_doc = db.scripts.find_one({'_id': script_id})
    if not script_doc:
        raise Exception("Could not find script {} {}".format(script_id, url))
    for field in ('sha256', 'md5', 'size_bytes'):
        result[field] = script_doc.get(field)
    code = script_doc.get('code')

    # 4. prefer the stored vector since re-computing it is slow
    stored_vector = db.statements_by_count.find_one({'url': url})
    if stored_vector:
        result.update(**stored_vector)
        result.pop('_id', None)
        result.pop('url', None)
        return result

    artefact = JavascriptArtefact(url=url,
                                  sha256=message.get('sha256'),
                                  md5=message.get('md5'))
    vectors, failed, stderr = analyse_script(code, artefact)
    if failed:
        raise Exception(stderr)
    result.update(**vectors['statements_by_count'])
    return result
def strategy_1_pyrequests(db, producer, artefact_url, cited_on, **kwargs):
    """
    Download artefact_url with python requests and record it as a JavascriptArtefact.

    Recognised kwargs: 'ua' (User-Agent header), 'referrer' or 'referer' (Referer header) and
    'to' (first argument given to producer.send()). On a 200 response the content is hashed,
    stored with save_artefact(), whatever save_artefact() returned first is sent via producer
    keyed by the sha256, and the artefact is returned.

    Returns None for any other status code, or when anything inside the download/save/send
    sequence raises (the exception is printed, not propagated). TLS certificate verification
    is disabled for the request (verify=False).
    """
    assert db is not None
    assert producer is not None
    assert len(artefact_url) > 0
    assert len(cited_on) > 0

    headers = {}
    ua = kwargs.get('ua', None)
    if ua is not None:
        assert len(ua) > 0
        headers['User-Agent'] = ua
    # one r or two.. your choice ;-)
    referrer = kwargs.get('referrer', kwargs.get('referer', None))
    if referrer is not None:
        assert len(referrer) > 0
        headers['Referer'] = referrer

    try:
        resp = requests.get(artefact_url, headers=headers, verify=False)
        if resp.status_code != 200:
            return None

        body = resp.content
        sha256 = hashlib.sha256(body).hexdigest()
        artefact = JavascriptArtefact(url=artefact_url,
                                      sha256=sha256,
                                      md5=hashlib.md5(body).hexdigest(),
                                      size_bytes=len(body),
                                      origin=cited_on)
        # the original notes say this saves to Mongo AND the Kafka visited topic;
        # the was_cached flag is not needed here
        saved, _was_cached = save_artefact(db, artefact, None, content=body)
        producer.send(kwargs.get('to'), saved, key=sha256.encode('utf-8'))
        return artefact
    except Exception as e:
        print(
            "Failed to download using python requests: {} (exception follows)".
            format(artefact_url))
        print(str(e))
        return None
def test_find_best_control(pytestconfig):
    """
    Analyse json2_4.9.2.min.js and check the best / next-best controls chosen by
    find_best_control() when it is given a mocked database.

    The best control must be at (effectively) zero AST and function-call distance; the
    next-best control has a larger AST distance but identical function calls. Both literal
    distances are expected to be -1.0.
    """
    js_file = "{}/src/test-javascript/json2_4.9.2.min.js".format(
        pytestconfig.rootdir)
    with open(js_file, 'rb') as fp:
        content = fp.read()
    jsr = JavascriptArtefact(url=js_file,
                             sha256=hashlib.sha256(content).hexdigest(),
                             md5=hashlib.md5(content).hexdigest(),
                             size_bytes=len(content))
    input_features, failed, stderr = analyse_script(
        content,
        jsr,
        feature_extractor="{}/src/extract-features.jar".format(
            pytestconfig.rootdir))
    assert not failed

    db = mock.Mock()
    # the original literal repeated the 'blah' key, which collapses to this single entry
    db.javascript_controls.find_one.return_value = {
        'literals_by_count': {
            'blah': 0
        }
    }
    d = json.loads(input_features.decode())
    d['js_id'] = 'XXXXXXXXXXXXXXXXXXXXXXXX'
    d['sha256'] = 'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'
    best_control, next_best_control = find_best_control(db, d, debug=True)
    print(best_control)
    print(next_best_control)

    # next_best_control has a larger ast_dist due to a different version, but it has the same
    # function calls so that distance is zero.
    # NOTE(review): an earlier comment here claimed next_best must have infinite distances
    # with only one control in the test; that contradicts this assertion - confirm which holds.
    assert next_best_control.ast_dist == pytest.approx(7.54983)
    assert next_best_control.function_dist == pytest.approx(0.0)
    assert best_control.ast_dist <= 0.0000001
    assert best_control.function_dist <= 0.000001
    # approx() must be the right-hand side of ==: `assert pytest.approx(x, -1.0)` passed -1.0
    # as the relative tolerance and never compared x with anything
    assert best_control.literal_dist == pytest.approx(-1.0)
    assert next_best_control.literal_dist == pytest.approx(-1.0)
def test_analyse_script_unpacker(pytestconfig):
    """
    Run analyse_script() over the three packed test scripts and check that each one's
    failed flag is the recorded one (all three are expected to fail).
    """
    expected_results = {
        'packed1.js': True,
        'packed2.js': True,
        'packed3.js': True
    }
    root = pytestconfig.rootdir

    for testjs, expect_failed in expected_results.items():
        filename = "{}/src/test-javascript/{}".format(root, testjs)
        artefact = JavascriptArtefact(url="file:{}".format(filename),
                                      origin=None,
                                      sha256='XXX',
                                      md5='XXX')
        # note: the file name, not the file content, is what gets handed to analyse_script()
        byte_content, failed, stderr = analyse_script(
            filename,
            artefact,
            feature_extractor="{}/src/extract-features.jar".format(root))
        assert failed == expect_failed
        if failed:
            continue
        # a successful analysis must produce output starting with '{'
        assert byte_content[0] == ord('{')
Beispiel #15
0
    rec = message.value
    if args.v:
        print(rec)
    try:
        assert 'calls_by_count' in rec
        assert 'statements_by_count' in rec
        assert 'url' in rec and rec['url'] is not None
        assert 'origin' in rec and rec['origin'] is not None
    except AssertionError:
        print(rec)
        n_bad += 1
        continue

    jsr = JavascriptArtefact(url=rec['url'],
                             origin=rec['origin'],
                             sha256=rec.get('sha256', ''),
                             md5=rec.get('md5', ''),
                             inline=rec.get('inline'))
    script, url_entry = find_script(db, jsr.url, want_code=False)
    if script is None or not '_id' in script:
        print(script)
        print(url_entry)
        n_bad += 1
        continue
    js_id = script.get('_id')
    save_ast_vector(db, jsr, rec['statements_by_count'], js_id=js_id)
    save_call_vector(db, jsr, rec['calls_by_count'], js_id=js_id)

    n += 1
    if n % 1000 == 0:
        print("Processed {} records.".format(n))
        print("Bad data - skipping... {}".format(hit))
        continue

    assert 'origin_js_id' in hit
    js_id = hit.get('origin_js_id')
    if js_id is None:
        print("Bad data - no origin_js_id... skipping".format(hit)) 
        continue

    ret = db.scripts.find_one({ '_id': ObjectId(js_id) })
    if ret is None:  # should not happen... but if it does...
        print("Unable to locate {} is db.scripts... skipping".format(js_id))
        continue
    content = ret.get('code') 
    jsr = JavascriptArtefact(url=hit.get('origin_url'), 
                             sha256=hashlib.sha256(content).hexdigest(), 
                             md5=hashlib.md5(content).hexdigest(), 
                             inline=False)
    m, failed, stderr = analyse_script(content, jsr, java=args.java, feature_extractor=args.extractor)
    if failed:
       n_failed += 1
       continue
    m.update({ 'origin': hit.get('cited_on'), 'js_id': js_id })
    assert 'js_id' in m and len(m['js_id']) > 0  # PRE-CONDITION: ensure hits have origin_js_id field set
    best_control, next_best_control = find_best_control(m, all_controls, db=db)
    d = asdict(best_control) # NB: all fields of the model are sent to output kafka topic and Mongo

    # 2a. also send results to MongoDB for batch-oriented applications and for long-term storage
    # POST-CONDITIONS which MUST be maintained are checked before pushing to topic
    assert 'cited_on' in d and len(d['cited_on']) > 0
    assert 'origin_url' in d and len(d['origin_url']) > 0
    assert isinstance(d['origin_js_id'], str) or d['origin_js_id'] is None
Beispiel #17
0
        mongo.close()
    except NameError:
        pass  # NameError occurs when using --file as consumer has not been setup since it is not required
    rm_pidfile('pid.eval.controls', root='.')
    sys.exit(0)


if args.v:
    print(db.javascript_controls.distinct('family'))

if args.file:
    with open(args.file, 'rb') as fp:
        content = fp.read()
        jsr = JavascriptArtefact(url=args.file,
                                 sha256=hashlib.sha256(content).hexdigest(),
                                 js_id='0' * 24,
                                 md5=hashlib.md5(content).hexdigest(),
                                 size_bytes=len(content))
        byte_content, failed, stderr = analyse_script(
            content, jsr, feature_extractor=args.extractor)
        if failed:
            raise ValueError("Failed to analyse script: {}\n{}".format(
                jsr, stderr))
        m = json.loads(byte_content.decode())
        m.update(asdict(jsr))
        print(m)
        best_control, next_best_control = find_best_control(
            db, m, max_distance=args.max_distance, debug=True)
        update_literal_distance(db,
                                best_control,
                                m['literals_by_count'],
Beispiel #18
0
def save_control(db,
                 url,
                 family,
                 variant,
                 version,
                 force=False,
                 refuse_hashes=None,
                 provider='',
                 java='/usr/bin/java',
                 feature_extractor=None,
                 content=None):
    """
    Update all control related data for the control at url.

    The content is downloaded from url unless supplied via content. It is then analysed with
    analyse_script() and the results are upserted into db.javascript_controls (keyed by
    origin+family) and db.javascript_control_code (keyed by origin), followed by
    update_control_summary().

    Nothing is analysed or stored when the content is smaller than 1000 bytes, or when its
    sha256 is in refuse_hashes and force is False. refuse_hashes may be omitted (None), which
    is treated as "refuse nothing".

    Returns the JavascriptArtefact representing the control (also returned when the control
    was refused). Raises ValueError if the download does not return HTTP 200 or the script
    cannot be analysed.
    """
    assert url is not None
    assert family is not None
    assert version is not None
    if refuse_hashes is None:
        # previously the `in` test below raised TypeError when the default was used
        refuse_hashes = frozenset()

    if content is None:
        resp = requests.get(url)
        if resp.status_code != 200:
            raise ValueError("Failed to fetch [{}] {}".format(
                resp.status_code, url))
        content = resp.content

    sha256 = hashlib.sha256(content).hexdigest()
    md5 = hashlib.md5(content).hexdigest()
    jsr = JavascriptArtefact(when=str(datetime.utcnow()),
                             sha256=sha256,
                             md5=md5,
                             url=url,
                             inline=False,
                             content_type='text/javascript',
                             size_bytes=len(content))
    if jsr.size_bytes < 1000:
        print(
            "Refusing artefact as too small to enable meaningful vector comparison: {}"
            .format(jsr))
        return jsr

    if not force and jsr.sha256 in refuse_hashes:
        print("Refusing to update existing control as dupe: {}".format(jsr))
        return jsr

    bytes_content, failed, stderr = analyse_script(
        content, jsr, java=java, feature_extractor=feature_extractor)
    if failed:
        raise ValueError('Could not analyse script {} - {}'.format(
            jsr.url, stderr))
    ret = json.loads(bytes_content.decode())
    _, subfamily = identify_control_subfamily(jsr.url)
    ret.update({
        'family': family,
        'release': version,
        'variant': variant,
        'origin': url,
        'sha256': sha256,
        'md5': md5,
        'size_bytes': len(content),
        'do_not_load':
        False,  # all controls loaded by default except alpha/beta/release candidate
        'provider': provider,
        'subfamily': subfamily
    })

    # NB: only one control per url/family pair (although in theory each CDN url is enough on its own)
    db.javascript_controls.find_one_and_update(
        {
            'origin': url,
            'family': family
        }, {"$set": ret}, upsert=True)
    # the raw code and the raw analysis output are kept in their own collection, one per url
    db.javascript_control_code.find_one_and_update({'origin': url}, {
        "$set": {
            'origin': url,
            'code': Binary(content),
            'analysis_bytes': bytes_content,
            "last_updated": jsr.when
        }
    },
                                                   upsert=True)
    update_control_summary(db, url, ret['statements_by_count'],
                           ret['calls_by_count'], ret['literals_by_count'])
    return jsr
Beispiel #19
0
def report_vectors(db, artefact_fname, control_url: str, artefact_url: str):
    """
    Print a comparison of the feature vectors of a control and of an artefact.

    The control's vectors are read from db.javascript_controls (by origin == control_url);
    the artefact's are computed by running analyse_script() on artefact_fname. Three
    comparisons are printed in turn: AST statements, function calls and literals.

    Raises AssertionError if a precondition fails or an expected vector is absent, and
    ValueError if the artefact cannot be analysed. Returns nothing.
    """
    assert len(control_url) > 0 and len(artefact_url) > 0
    assert os.path.exists(artefact_fname)

    vector_names = ('literals_by_count', 'statements_by_count',
                    'calls_by_count')

    cntrl = db.javascript_controls.find_one({'origin': control_url})
    assert cntrl is not None
    for vector_name in vector_names:
        assert vector_name in cntrl

    # we must analyse the artefact to get its vectors (since its too expensive to search kafka for it)
    artefact = JavascriptArtefact(url=artefact_url,
                                  sha256='XXX',
                                  md5='XXX',
                                  inline=False)
    byte_content, failed, stderr = analyse_script(artefact_fname, artefact)
    if failed:
        raise ValueError("Unable to analyse script: {}\n{}".format(
            artefact_url, stderr))
    ret = json.loads(byte_content.decode())
    for vector_name in vector_names:
        assert vector_name in ret

    # --- AST statements ---
    control_ast, control_ast_sum = calculate_ast_vector(
        cntrl['statements_by_count'])
    artefact_ast, artefact_ast_sum = calculate_ast_vector(
        ret['statements_by_count'])
    print("Control url is: {}".format(control_url))
    print("Artefact url is: {}".format(artefact_url))
    print("AST vector magnitudes: control={} artefact={}".format(
        control_ast_sum, artefact_ast_sum))
    print(control_ast)
    print(artefact_ast)
    print("AST distance: {:.2f}".format(
        compute_distance(control_ast, artefact_ast)))
    # features whose counts differ, smallest difference first
    differing = [(feature, abs(control_ast[idx] - artefact_ast[idx]))
                 for idx, feature in enumerate(ast_feature_list)
                 if control_ast[idx] != artefact_ast[idx]]
    differing.sort(key=lambda pair: pair[1])
    items = ['{} ({})'.format(feature, delta) for feature, delta in differing]
    print("AST features which are different: ", ','.join(items))

    # --- function calls ---
    control_calls = cntrl['calls_by_count']
    artefact_calls = ret['calls_by_count']
    all_calls = set(control_calls.keys()).union(artefact_calls.keys())
    diffs = [
        fn for fn in all_calls
        if control_calls.get(fn, 0) != artefact_calls.get(fn, 0)
    ]
    control_fn, control_fn_sum = calculate_vector(control_calls,
                                                  feature_names=all_calls)
    artefact_fn, artefact_fn_sum = calculate_vector(artefact_calls,
                                                    feature_names=all_calls)
    print("Function call magnitudes: control={} artefact={}".format(
        control_fn_sum, artefact_fn_sum))
    print(control_fn)
    print(artefact_fn)
    print("Function call distance: {:.2f}".format(
        compute_distance(control_fn, artefact_fn)))
    if diffs:
        print("Functions not called the expected number of times: {}".format(
            ' '.join(diffs)))
    else:
        print("All functions called the expected number of times.")

    # --- literals ---
    literal_dist, n_not_in_origin, n_not_in_control, diff_literals = calculate_literal_distance(
        truncate_literals(cntrl['literals_by_count']),
        truncate_literals(ret['literals_by_count']))
    print("Literal distance is: {}".format(literal_dist))
    print("Number of literals in control but not origin: {}".format(
        n_not_in_origin))
    print("Number of literals in origin but not control: {}".format(
        n_not_in_control))
    print("Diff literals: {}".format(diff_literals))
Beispiel #20
0
if __name__ == "__main__":
   args = a.parse_args()
   mongo = pymongo.MongoClient(args.db, args.port, username=args.dbuser, password=str(args.dbpassword))
   db = mongo[args.dbname]
   cursor = db.javascript_control_code.find({}, no_cursor_timeout=True) # long-running find so we try to avoid it being killed prematurely...
   with cursor:
       for rec in cursor:
           assert 'code' in rec
           control_url = rec.get('origin')
           assert control_url.startswith("http")
           if args.v:
               print(control_url)

           # recalculate vector?
           if args.recalc:
               jsr = JavascriptArtefact(url=control_url, sha256='XXX', md5='YYY', inline=False)  # only url matters for analyse script
               vectors, failed, stderr = analyse_script(rec.get('code'), jsr, java=args.java, feature_extractor=args.extractor)
               assert not failed
               assert isinstance(vectors, bytes)
               print(jsr.url)
               required_hash = hashlib.sha256(vectors).hexdigest()
               db.javascript_control_code.update_one({ '_id': rec.get('_id') }, 
                                                 { "$set": { "analysis_bytes": Binary(vectors) } })
           else: 
               vectors = rec.get('analysis_bytes')
               assert vectors is not None
               assert isinstance(vectors, bytes)
           
           d = json.loads(vectors)
           update_control_summary(db, control_url, d['statements_by_count'], d['calls_by_count'], d['literals_by_count'])
           db.javascript_controls.update_one({ 'origin': control_url }, { '$set': {