Example #1
def crawl_debian_security(local, parallel=True):
    # Works faster with parallelism.
    cve_re = re.compile(r"^(CVE\S+)")
    list_url = "https://salsa.debian.org/security-tracker-team/security-tracker/raw/master/data/CVE/list"

    if not local:
        raw = net.get_raw_resource(list_url)
    else:
        path = misc.repo_path("security-tracker-team", "security-tracker")
        path = os.path.join(path, "raw", "master", "data", "CVE", "list")
        with open(path, "r") as f:
            raw = f.read()

    indices = [x.start() for x in re.finditer(r"^CVE", raw, re.MULTILINE)]
    sub_strings = [raw[s:f] for s, f in zip(indices, indices[1:] + [len(raw)])]
    cve_index = _index_cves()

    bar = misc.KnownLengthBar(maxval=len(sub_strings), parallel=parallel)

    def worker(sub_string):
        cve_string = re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        queries = []
        if cve:
            url_strings = misc.flatten_list([
                re.findall(url_re, sub_string) for url_re in urls_re_whitelist
            ])
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["DebianSec"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, sub_strings, parallel, bar)
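
Several crawlers in these examples hand their worker to _process_queries_from_workers, whose definition is not shown. A minimal sketch of what such a helper could look like, assuming it simply maps the worker over the items (on a thread pool when parallel is set) and flattens the per-item query lists; the bar.finish() call is likewise an assumption about the progress-bar API:

from multiprocessing.dummy import Pool  # thread-based Pool


def _process_queries_from_workers(worker, items, parallel, bar):
    # Hypothetical sketch, not the actual implementation: run `worker`
    # over every item and flatten the per-item query lists into one list.
    if parallel:
        pool = Pool()
        try:
            results = pool.map(worker, items)
        finally:
            pool.close()
            pool.join()
    else:
        results = [worker(item) for item in items]
    bar.finish()  # assumed finalization hook on misc.KnownLengthBar
    return [query for queries in results for query in queries]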
Example #2
def crawl_django():
    cve_re = re.compile(r":cve:`(\S+)`")
    commit_re = re.compile(r"(https://github.com/\S+/[a-f0-9]+)")
    cve_index = _index_cves()
    raw = net.get_raw_resource(
        "https://raw.githubusercontent.com/django/django/master/docs/releases/security.txt"
    )

    indices = [x.start() for x in re.finditer(r":cve:", raw)]
    sub_strings = [raw[s:f] for s, f in zip(indices, indices[1:] + [len(raw)])]

    bar = misc.KnownLengthBar(maxval=len(indices), parallel=False)

    def worker(sub_string):
        queries = []
        cve_string = "CVE-" + re.findall(cve_re, sub_string)[0]
        cve = cve_index.get(cve_string)
        if not cve:
            print "CVE not found?!: " + cve_string
            return []

        # Find the URLs
        url_strings = re.findall(commit_re, sub_string)
        if url_strings:
            queries = _get_queries_for_cve_url(cve_string, url_strings,
                                               ["Python", "Django"])
        return queries

    return _process_queries_from_workers(worker, sub_strings, False, bar)
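
For reference, Django's security release notes mark entries with a :cve: role whose argument omits the "CVE-" prefix, which is why the worker prepends it. A quick self-contained check with a fabricated line:

import re

cve_re = re.compile(r":cve:`(\S+)`")
sample = "* :cve:`2016-2512` - Malicious redirect ..."  # fabricated entry
print("CVE-" + re.findall(cve_re, sample)[0])  # CVE-2016-2512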
Example #3
def extract_from_chromium_codereview(url_string):
    if "codereview.chromium.org" not in url_string:
        return []
    try:
        raw = net.get_raw_resource(url_string)
    except Exception:
        print "got exception for: " + url_string
        return []
    hashes = re.findall(r"Committed: <a href=\S+/([0-9a-f]{40})\">", raw)
    return _get_queries_for_hash_url(url_string, hashes)
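
The "Committed:" pattern targets the anchor that Rietveld-based codereview.chromium.org issue pages render once a change lands. A standalone check of the regex against a fabricated snippet of that markup:

import re

raw = ('Committed: <a href="https://chromium.googlesource.com/chromium/src/+/'
       'a3f5478cbde18f5f3a02454a9fcd8b8a9a522a2c">a3f5478</a>')
print(re.findall(r"Committed: <a href=\S+/([0-9a-f]{40})\">", raw))
# ['a3f5478cbde18f5f3a02454a9fcd8b8a9a522a2c']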
Example #4
def extract_from_github_issue(url_string):
    queries = []
    if re.match(github_issue_re, url_string):
        try:
            raw = net.get_raw_resource(url_string, auth=None)
        except Exception:
            return []
        hashes = re.findall(github_commit_re, raw)
        hashes += re.findall(github_commit_relative_re, raw)
        queries = _get_queries_for_hash_url(url_string, hashes)
    return queries
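
github_issue_re, github_commit_re, and github_commit_relative_re are module-level patterns defined elsewhere. Plausible shapes, given how they are used here and in extract_from_github_pull below (these are assumptions, not the original definitions):

import re

# Assumed: matches issue URLs such as https://github.com/owner/repo/issues/42
github_issue_re = re.compile(r"https?://github\.com/[^/]+/[^/]+/issues/\d+")
# Assumed: captures the hash from absolute commit URLs in the page body.
github_commit_re = re.compile(
    r"https?://github\.com/[^/]+/[^/]+/commit/([0-9a-f]{40})")
# Assumed: captures the hash from repo-relative hrefs like
# href="/owner/repo/commit/<sha>".
github_commit_relative_re = re.compile(
    r"href=\"/[^/]+/[^/]+/commit/([0-9a-f]{40})\"")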
Example #5
def extract_from_moodle(url_string):
    if "git.moodle.org" not in url_string or "MDL" not in url_string:
        # Links without "MDL" and with "commit" are processed in
        # extract_from_commit_urls.
        return []
    try:
        raw = net.get_raw_resource(url_string)
    except Exception:
        return []
    hashes = re.findall(r"<a href=.+?h=([0-9a-f]{40})\">commit</a>", raw)
    return _get_queries_for_hash_url(url_string, hashes)
Example #6
def worker(temp):
    raw = net.get_raw_resource(
        "https://security-tracker.debian.org/tracker/{0}".format(temp))
    url_strings = [
        x for x in re.findall(href_re, raw) if _should_keep_url(x)
    ]
    queries = [db.InsertQuery(models.NonCVE, hash_id=temp)]
    queries += _get_queries_for_noncve_url(temp, url_strings,
                                           ["DebianFake"])
    bar.update()
    return queries
Example #7
def worker(url_string):
    raw = net.get_raw_resource(url_string)
    results = re.findall(
        r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", raw, re.DOTALL)
    results = [r for r in results if _should_keep_url(r[1])]
    queries = [
        db.InsertQuery(models.CVE, cve_string=r[0]) for r in results
    ]
    queries += [db.InsertQuery(models.URL, url=r[1]) for r in results]
    queries += [
        db.ConnectQuery(models.cve_url_table, r[0], r[1]) for r in results
    ]
    bar.update()
    return queries
Example #8
def update_cwe_descriptions():
    url = "https://nvd.nist.gov/vuln/categories"

    cwes = db.global_session.query(
        models.CWE).filter(models.CWE.description == None).all()
    if not cwes:
        return

    raw = net.get_raw_resource(url)
    regex = r"<span.+?>CWE-(\d+)</span>.+?<a href.+?>(.+?)</a>"
    descriptions = {x[0]: x[1] for x in re.findall(regex, raw, re.DOTALL)}
    for cwe in cwes:
        description = descriptions.get(cwe.cwe_string[4:])
        cwe.description = description

    db.global_session.commit()
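
One detail worth noting: the models.CWE.description == None comparison is deliberate. SQLAlchemy overloads == on column objects to emit an IS NULL predicate, so Python's "is None" would not work in a filter; .is_(None) expresses the same thing more explicitly. A self-contained illustration:

from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class CWE(Base):
    __tablename__ = "cwe"
    id = Column(Integer, primary_key=True)
    description = Column(String)

# Both expressions render the same SQL predicate:
print(CWE.description == None)    # cwe.description IS NULL
print(CWE.description.is_(None))  # cwe.description IS NULL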
Example #9
def extract_from_github_pull(url_string):
    queries = []
    if re.match(github_pull_re, url_string):
        commit_match = re.match(r".+/commits?/([0-9a-f]+)/?$", url_string)
        if commit_match:
            return _get_queries_for_hash_url(url_string,
                                             [commit_match.group(1)])
        normalized_url = re.sub(r"/?files.+$", "", url_string)
        try:
            raw = net.get_raw_resource(normalized_url + "/commits/", auth=None)
        except Exception:
            return []
        hashes = re.findall(github_commit_re, raw)
        hashes += re.findall(github_commit_relative_re, raw)
        queries = _get_queries_for_hash_url(url_string, hashes)
    return queries
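
The re.sub above strips the /files diff-view suffix so the crawler can append /commits/ to the bare pull request URL. A quick demonstration with a fabricated URL:

import re

url = "https://github.com/owner/repo/pull/1234/files#diff-0a1b"  # fabricated
print(re.sub(r"/?files.+$", "", url))
# https://github.com/owner/repo/pull/1234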
Example #10
def extract_from_apache_svn(url_string):
    # TODO: search other svn urls.
    if not re.match(svn_apache_re, url_string):
        return []
    db_queries = []
    query_t = "org:apache {0}"
    params = urlparse.parse_qs(urlparse.urlparse(url_string).query)

    # Search by revision id.
    r1 = r2 = revision_id = None
    for k, v in params.iteritems():
        if k in ["rev", "revision"]:
            revision_id = v[0]
        elif k == "r1":
            r1 = v[0]
        elif k == "r2":
            r2 = v[0]
    if not revision_id:
        if r1 and r2:
            # Revision ids are numeric strings; compare them as integers,
            # not lexically ("999" would otherwise beat "1000").
            revision_id = max(r1, r2, key=int)
        else:
            match = re.match(svn_apache_revision_re, url_string)
            if match:
                revision_id = match.group(1)
    if revision_id:
        query_string = query_t.format(revision_id)
        db_queries += _get_queries_for_search_url(url_string, query_string)

    # Search by commit message.
    try:
        raw = net.get_raw_resource(url_string)
        messages = re.findall(svn_apache_message_re, raw)
        if messages:
            query_string = query_t.format(messages[0])
            db_queries += _get_queries_for_search_url(url_string, query_string)
    except Exception as e:
        print "got exception for: {0}: {1}".format(url_string, e)

    return db_queries
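
Note that urlparse.parse_qs returns every query parameter as a list, which is why the code indexes v[0]. A quick illustration with a fabricated viewvc-style URL (urlparse is the Python 2 module; Python 3 moved it to urllib.parse):

import urlparse

url = "https://svn.apache.org/viewvc?view=revision&revision=1234567"  # fabricated
params = urlparse.parse_qs(urlparse.urlparse(url).query)
print(params)  # {'view': ['revision'], 'revision': ['1234567']}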
Example #11
def crawl_android_security_bulletin(parallel=True):
    raw = net.get_raw_resource("https://source.android.com/security/bulletin/")
    urls = re.findall(
        r"https?://source.android.com/security/bulletin/[0-9-]{10}", raw)
    urls = misc.unique(urls)
    bar = misc.KnownLengthBar(maxval=len(urls), parallel=parallel)

    def worker(url_string):
        raw = net.get_raw_resource(url_string)
        results = re.findall(
            r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">", raw, re.DOTALL)
        results = [r for r in results if _should_keep_url(r[1])]
        queries = [
            db.InsertQuery(models.CVE, cve_string=r[0]) for r in results
        ]
        queries += [db.InsertQuery(models.URL, url=r[1]) for r in results]
        queries += [
            db.ConnectQuery(models.cve_url_table, r[0], r[1]) for r in results
        ]
        bar.update()
        return queries

    return _process_queries_from_workers(worker, urls, parallel, bar)
Example #12
def crawl_debian_fake_names(parallel=True):
    # Works faster with parallelism.
    main_page = net.get_raw_resource(
        "https://security-tracker.debian.org/tracker/data/fake-names")
    temp_re = re.compile(r"/tracker/(TEMP-[0-9A-F-]+)")
    href_re = re.compile(r'href="(\S+?)"')

    temps = re.findall(temp_re, main_page)
    bar = misc.KnownLengthBar(maxval=len(temps), parallel=parallel)

    def worker(temp):
        raw = net.get_raw_resource(
            "https://security-tracker.debian.org/tracker/{0}".format(temp))
        url_strings = [
            x for x in re.findall(href_re, raw) if _should_keep_url(x)
        ]
        queries = [db.InsertQuery(models.NonCVE, hash_id=temp)]
        queries += _get_queries_for_noncve_url(temp, url_strings,
                                               ["DebianFake"])
        bar.update()
        return queries

    return _process_queries_from_workers(worker, temps, parallel, bar)
Example #13
def file_func(file_obj):
    name = basename(file_obj["path"])
    if local:
        raw = fs.read_file(file_obj["path"])
    else:
        raw = net.get_raw_resource(file_obj["download_url"],
                                   auth=net.github_auth)
    url_strings = misc.unique(re.findall(commit_re, raw))
    queries = []
    if url_strings:
        # Figure out the tag list.
        tags = ["vulndb"]
        for k, v in tag_dict.iteritems():
            if k in file_obj["path"]:
                tags.append(v)
                break

        # Insert CVEs/NonCVEs and connect them to URLs.
        cve_strings = misc.unique(re.findall(cve_re, raw))
        if not cve_strings:
            hash_id = models.NonCVE.hash_id_for_urls(url_strings)
            queries = [db.InsertQuery(models.NonCVE, hash_id=hash_id)]
            queries += _get_queries_for_noncve_url(hash_id, url_strings,
                                                   tags)
        else:
            # Surprisingly, there are some CVEs in this db that are marked
            # as reserved in the NIST feed. We need to add them here.
            queries = [
                db.InsertQuery(models.CVE, cve_string=cve_string)
                for cve_string in cve_strings
            ]
            for cve_string in cve_strings:
                queries += _get_queries_for_cve_url(
                    cve_string, url_strings, tags)
    bar.update()
    return ([], queries)
Example #14
def file_func(file_obj):
    name, ext = splitext(basename(file_obj["path"]))
    queries = []
    if ext == ".patch" and cve_index.get(name):
        if local:
            raw = fs.read_file(file_obj["path"])
        else:
            raw = net.get_raw_resource(file_obj["download_url"],
                                       auth=net.github_auth)
        hashes = re.findall(hash_re, raw)
        for h in hashes:
            api_url = api_url_t.format(h)
            http_url = http_url_t.format(h)
            try:
                net.get_json_resource(api_url, auth=net.github_auth)
            except Exception:
                # Not an upstream commit; skip it.
                # print "bad response for resource: " + api_url
                pass
            else:
                # Accumulate rather than overwrite, so queries for every
                # upstream hash survive the loop.
                queries += _get_queries_for_cve_url(
                    name, [http_url], ["C", "Linux", "github"])
    bar.update()
    return ([], queries)
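
The probe against api_url is what decides whether a hash refers to an upstream commit: if the endpoint answers, the commit exists upstream and is worth recording. A minimal standalone equivalent of that check, written against the public GitHub API as an assumption about what api_url_t expands to:

import json
import urllib2  # Python 2; urllib.request in Python 3


def is_upstream_commit(sha, repo="torvalds/linux"):  # hypothetical helper
    # GET /repos/{owner}/{repo}/commits/{ref} answers 200 for commits that
    # exist in the repository and an error status otherwise.
    url = "https://api.github.com/repos/{0}/commits/{1}".format(repo, sha)
    try:
        json.load(urllib2.urlopen(url))
    except urllib2.HTTPError:
        return False
    return True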