Beispiel #1
0
    def items_handler(cve_items):
        """Translate CVE feed items into a flat list of database queries.

        For every item, in order, the result contains:

        * an insert for the CVE identifier (``cve.CVE_data_meta.ID``),
        * for each reference URL accepted by ``_should_keep_url``: an insert
          for the URL followed by a CVE<->URL connection,
        * for each problem-type description value: an insert for the CWE
          string followed by a CVE<->CWE connection.

        Args:
            cve_items: Iterable of dicts with the nested keys read below
                ("cve" -> "CVE_data_meta"/"references"/"problemtype").

        Returns:
            list: The query objects, in the order described above.
        """
        queries = []
        for item in cve_items:
            cve_id = item["cve"]["CVE_data_meta"]["ID"]
            queries.append(db.InsertQuery(models.CVE, cve_string=cve_id))

            # Collect every reference URL first, then keep only the wanted ones.
            reference_urls = [
                ref["url"]
                for ref in item["cve"]["references"]["reference_data"]
            ]
            for url_string in reference_urls:
                if not _should_keep_url(url_string):
                    continue
                queries.append(db.InsertQuery(models.URL, url=url_string))
                queries.append(
                    db.ConnectQuery(models.cve_url_table, cve_id, url_string))

            # Flatten problemtype_data[*].description[*].value.
            cwe_strings = []
            for problem in item["cve"]["problemtype"]["problemtype_data"]:
                for description in problem["description"]:
                    cwe_strings.append(description["value"])

            for cwe_string in cwe_strings:
                queries.append(
                    db.InsertQuery(models.CWE, cwe_string=cwe_string))
                queries.append(
                    db.ConnectQuery(models.cve_cwe_table, cve_id, cwe_string))

        return queries
Beispiel #2
0
def _do_github_search_query(search_query):
    """Run a stored GitHub search and record the commit hashes it returns.

    Several variants of ``search_query.query`` are tried in turn: the raw
    text, the text with ``<a href ...>`` / ``</a>`` tags removed, and each
    line of the tag-stripped text. Empty variants are skipped.

    For each variant the response code from ``net.github_search`` decides
    what happens:

    * ``net.CODE_TIMEOUT``: the variant is queued again and the function
      sleeps for 60 seconds.
    * ``net.CODE_VALIDATION``: the query state is set to ``ERROR`` and
      committed; the remaining variants are still tried.
    * ``net.CODE_OK``: if more than 5 items came back, only those whose
      commit message passes ``_messages_match`` are kept. When any hashes
      remain, the state becomes ``NON_EMPTY``, the hashes are inserted and
      connected to the query and to every URL in ``search_query.urls``, and
      the function returns. Otherwise the state becomes ``EMPTY`` and the
      next variant is tried.

    Args:
        search_query: Object exposing ``query``, ``urls`` and a writable
            ``state`` attribute.

    Raises:
        RuntimeError: If ``net.github_search`` returns any other code.
    """
    logger = logging.getLogger("github_search")

    def remove_hrefs(s):
        s = re.sub(r"<a href.+?>", "", s)
        s = s.replace("</a>", "")
        return s

    mutations = [
        lambda x: [x],
        lambda x: [remove_hrefs(x)],
        lambda x: remove_hrefs(x).split("\n"),
    ]
    # Explicit work queue: timed-out variants are re-queued at the end,
    # without mutating a list while a for-loop is iterating over it.
    pending = list(misc.unique(
        misc.flatten_list([m(search_query.query) for m in mutations])))
    while pending:
        query_str = pending.pop(0)
        if not query_str:
            continue
        logger.info("trying %s", query_str)
        code, answer = net.github_search(query_str)
        if code == net.CODE_TIMEOUT:
            logger.info("sleeping...")
            pending.append(query_str)  # Try again.
            time.sleep(60)
        elif code == net.CODE_VALIDATION:
            # %s formatting so a non-string answer cannot break the log call.
            logger.info("got 422: %s", answer)
            search_query.state = models.GithubSearchQuery.ERROR
            db.global_session.commit()
        elif code == net.CODE_OK:
            if len(answer["items"]) > 5:
                answer["items"] = [
                    item for item in answer["items"] if _messages_match(
                        search_query.query, item["commit"]["message"])
                ]
            hash_strings = misc.unique(
                [item["sha"] for item in answer["items"]])
            logger.info("got results: %s", hash_strings)
            if hash_strings:
                search_query.state = models.GithubSearchQuery.NON_EMPTY
                queries = []
                for h in hash_strings:
                    queries += [
                        db.InsertQuery(models.CommitHash, hash=h),
                        db.ConnectQuery(models.query_hash_table,
                                        search_query.query, h)
                    ]
                    queries += [
                        db.ConnectQuery(models.hash_url_table, h, url.url)
                        for url in search_query.urls
                    ]
                db.process_queries(queries)
                db.global_session.commit()  # Commit state update.
                return
            search_query.state = models.GithubSearchQuery.EMPTY
            db.global_session.commit()  # Commit state update.
        else:
            # Raising a plain string is itself a TypeError in Python 3 and
            # would hide this message; raise a real exception instead.
            raise RuntimeError(
                "got something unexpected: {0} {1}".format(code, answer))
Beispiel #3
0
def _get_queries_for_search_url(url_string, search_query):
    """Build the queries that store a GitHub search string for a URL.

    The search string is first truncated by slicing it to
    ``config.MAX_GITHUB_QUERY_LEN``; the truncated value is used in both
    queries.

    Args:
        url_string: URL the search string belongs to.
        search_query: The search string (any sliceable value).

    Returns:
        list: An insert for the search query, then its connection to the URL.
    """
    truncated = search_query[:config.MAX_GITHUB_QUERY_LEN]
    insert_query = db.InsertQuery(models.GithubSearchQuery, query=truncated)
    link_query = db.ConnectQuery(
        models.query_url_table, truncated, url_string)
    return [insert_query, link_query]
Beispiel #4
0
def _get_queries_for_hash_url(url_string, hashes):
    """Build the queries that store commit hashes and link them to a URL.

    The hashes are de-duplicated with ``misc.unique``. When
    ``config.IGNORE_SHORT_HASHES`` is set, only hashes of exactly 40
    characters are kept.

    Args:
        url_string: URL the hashes were found for.
        hashes: Commit hash strings, possibly containing duplicates.

    Returns:
        list: All hash inserts first, followed by all hash<->URL connections.
    """
    kept = misc.unique(hashes)
    if config.IGNORE_SHORT_HASHES:
        full_length = []
        for candidate in kept:
            if len(candidate) == 40:
                full_length.append(candidate)
        kept = full_length

    queries = []
    for commit_hash in kept:
        queries.append(db.InsertQuery(models.CommitHash, hash=commit_hash))
    for commit_hash in kept:
        queries.append(
            db.ConnectQuery(models.hash_url_table, commit_hash, url_string))
    return queries
Beispiel #5
0
def _get_queries_for_noncve_url(hash_id, url_strings, tags=[]):
    queries = \
        [db.InsertQuery(models.URL, url=url_string)
            for url_string in url_strings
        ] + \
        [db.ConnectQuery(models.non_cve_url_table, hash_id, url_string)
            for url_string in url_strings
        ]
    if url_strings and tags:
        queries += [db.UpdateTagQuery(models.NonCVE, hash_id, tags)]
    return queries
Beispiel #6
0
def _get_queries_for_cve_url(cve_string, url_strings, tags=[]):
    queries = \
        [db.InsertQuery(models.URL, url=url_string)
            for url_string in url_strings
        ] + \
        [db.ConnectQuery(models.cve_url_table, cve_string, url_string)
            for url_string in url_strings
        ]
    if url_strings and tags:
        queries += [db.UpdateTagQuery(models.CVE, cve_string, tags)]
    return queries
Beispiel #7
0
 def worker(url_string):
     """Scrape one page for (CVE id, link) table rows and build DB queries.

     The page is fetched with ``net.get_raw_resource`` and scanned for rows
     of the form ``<td>CVE...</td> <td><a href="...">``. Rows whose link is
     rejected by ``_should_keep_url`` are dropped.

     Args:
         url_string: Address of the page to fetch.

     Returns:
         list: All CVE inserts, then all URL inserts, then all CVE<->URL
         connections for the kept rows.
     """
     page = net.get_raw_resource(url_string)
     row_pattern = r"<td>(CVE[0-9-]+)</td>\s+<td><a href=\"(\S+?)\">"
     kept_rows = []
     for cve_string, link in re.findall(row_pattern, page, re.DOTALL):
         if _should_keep_url(link):
             kept_rows.append((cve_string, link))

     queries = []
     for cve_string, _ in kept_rows:
         queries.append(db.InsertQuery(models.CVE, cve_string=cve_string))
     for _, link in kept_rows:
         queries.append(db.InsertQuery(models.URL, url=link))
     for cve_string, link in kept_rows:
         queries.append(
             db.ConnectQuery(models.cve_url_table, cve_string, link))

     # `bar` comes from the enclosing scope (presumably a progress bar --
     # confirm against the surrounding function).
     bar.update()
     return queries
Beispiel #8
0
def crawl_jenkins():
    """Register Jenkins commits whose message mentions "SECURITY".

    The commit log of ``jenkinsci/jenkins`` is read via
    ``misc.get_repo_log``. Each selected commit gets a ``JENKINS-<n>``
    identifier, where ``<n>`` is its position among the selected commits.
    For each one, queries are built to insert the non-CVE entry, insert the
    commit URL, connect the two, and tag the entry with "Jenkins" and
    "Java". All queries are handed to ``db.process_queries`` in one call.
    """
    commit_url_template = "https://github.com/jenkinsci/jenkins/commit/{0}"
    id_template = "JENKINS-{0}"

    security_hashes = []
    for commit in misc.get_repo_log("jenkinsci", "jenkins"):
        if "SECURITY" in commit["message"]:
            security_hashes.append(commit["hash"])

    queries = []
    for index, commit_hash in enumerate(security_hashes):
        hash_id = id_template.format(index)
        url_string = commit_url_template.format(commit_hash)
        queries.append(db.InsertQuery(models.NonCVE, hash_id=hash_id))
        queries.append(db.InsertQuery(models.URL, url=url_string))
        queries.append(
            db.ConnectQuery(models.non_cve_url_table, hash_id, url_string))
        queries.append(
            db.UpdateTagQuery(models.NonCVE, hash_id, ["Jenkins", "Java"]))
    db.process_queries(queries)