Example 1
    def retry(self, f):
        for t in range(self.maxtries):
            try:
                self._create_conn()
                return f()
            except MySQLdb._exceptions.OperationalError as e:
                last_exception = e

                try:
                    self._close_conn()
                except Exception as e2:
                    log_error("Suppressed exception: {}".format(str(e2)), 3)

                if t + 1 == self.maxtries:
                    log_error(
                        "MySQL connection eventually failed, closing connection!",
                        3)
                    raise last_exception
                else:
                    factor = 0.2
                    logmsg = ("Exception ({}) on mysql connection attempt "
                              "{} of {}, wating {}s +/- {}% before retrying..."
                              ).format(str(e), t + 1, self.maxtries,
                                       self.try_wait, factor * 100)
                    log_warning(logmsg, 3)
                    time.sleep(self.try_wait * uniform(1 - factor, 1 + factor))
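A minimal, self-contained sketch of the same retry-with-jitter pattern outside the class context; retry_call, max_tries and base_wait are illustrative names, not part of the original code:

import time
from random import uniform

def retry_call(f, max_tries=3, base_wait=1.0, retry_on=(OSError,)):
    """Call f(), retrying up to max_tries times with +/- 20% jittered sleeps."""
    factor = 0.2
    for t in range(max_tries):
        try:
            return f()
        except retry_on as e:
            if t + 1 == max_tries:
                raise
            print("attempt {} of {} failed ({}), retrying...".format(
                t + 1, max_tries, e))
            time.sleep(base_wait * uniform(1 - factor, 1 + factor))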
Example 2
def parse_and_insert_replies(ext_id, date, repliespath, con):
    log_debug("- parsing reply file", 3)
    with open(repliespath) as f:
        d = json.load(f)
        if "searchResults" not in d:
            log_warning("* WARNING: there are no search results in {}".format(repliespath), 3)
            return
        for result in d["searchResults"]:
            if "annotations" not in result:
                continue
            for annotation in result["annotations"]:
                comment = get(annotation, "comment")
                if comment is not None:
                    commentmd5 = hashlib.md5(comment.encode()).digest()
                    con.insert(
                        "reply",
                        extid=ext_id,
                        date=convert_date(date),
                        commentdate=datetime.datetime.utcfromtimestamp(
                            get(annotation, "timestamp")).isoformat()
                        if "timestamp" in annotation else None,
                        replyto=get(
                            get(get(annotation, "entity"), "annotation"),
                            "author"),
                        commentmd5=commentmd5,
                        displayname=get(
                            get(annotation, "entity"), "displayName"),
                        author=get(get(annotation, "entity"), "author"),
                        language=get(annotation, "language"),
                        shortauthor=get(
                            get(annotation, "entity"), "shortAuthor"))
                    con.insert(
                        "reply_comment",
                        commentmd5=commentmd5,
                        comment=comment)
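The nested get(...) calls above rely on a None-tolerant accessor defined elsewhere in the project; a plausible minimal version (an assumption, not the project's actual helper) would be:

def get(d, key):
    # Return d[key] if d is a dict containing key, otherwise None,
    # so chained lookups like get(get(annotation, "entity"), "author")
    # degrade gracefully to None instead of raising.
    if isinstance(d, dict):
        return d.get(key)
    return None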
Example 3
def get_etag(ext_id, datepath, con):
    # Trying to parse etag file
    etagpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.etag"))), None)
    if etagpath:
        with open(etagpath) as f:
            return f.read()

    # Trying to parse header file for etag
    headerpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
    if headerpath:
        with open(headerpath) as f:
            content = f.read()
            try:
                headers = ast.literal_eval(content)
                if "ETag" in headers:
                    return headers["ETag"]
            except Exception:
                log_warning("* WARNING: could not parse crx header file", 3)

    # Trying to look up previous etag in database
    linkpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
    if linkpath:
        with open(linkpath) as f:
            link = f.read()
            linked_date = link[3:].split("/")[0]

            result = con.get_etag(ext_id, convert_date(linked_date))
            if result is not None:
                return result

    return None
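In the header-file branch, the *.crx.headers files are expected to contain a Python literal dict, hence ast.literal_eval rather than json.loads. A sketch of that step in isolation, using an inline sample string instead of a real file:

import ast

sample = "{'ETag': '\"abc123\"', 'Content-Type': 'application/x-chrome-extension'}"
try:
    headers = ast.literal_eval(sample)
    etag = headers.get("ETag")  # '"abc123"'
except (ValueError, SyntaxError):
    etag = None  # malformed header dump; fall back to other sources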
Example 4
def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
    log_info("* Updating db with data from from {}".format(date), 2)
    datepath = os.path.join(tmptardir, date)

    etag = get_etag(ext_id, datepath, con)

    if etag:
        try:
            parse_and_insert_crx(ext_id, datepath, con)
        except Exception:
            log_exception("Exception when parsing crx", 3)
    else:
        crx_status = get_crx_status(datepath)
        if crx_status != 401 and crx_status != 204 and crx_status != 404:
            log_warning("* WARNING: could not find etag", 3)

    try:
        parse_and_insert_overview(ext_id, date, datepath, con)
    except Exception:
        log_exception("Exception when parsing overview", 3)

    try:
        parse_and_insert_status(ext_id, date, datepath, con)
    except Exception:
        log_exception("Exception when parsing status", 3)

    reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
    for reviewpath in reviewpaths:
        try:
            parse_and_insert_review(ext_id, date, reviewpath, con)
        except json.decoder.JSONDecodeError:
            log_warning("- WARNING: Review is not a proper json file!", 3)
        except Exception:
            log_exception("Exception when parsing review", 3)

    supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
    for supportpath in supportpaths:
        try:
            parse_and_insert_support(ext_id, date, supportpath, con)
        except json.decoder.JSONDecodeError:
            log_warning("- WARNING: Support is not a proper json file!", 3)
        except Exception:
            log_exception("Exception when parsing support", 3)

    repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
    for repliespath in repliespaths:
        try:
            parse_and_insert_replies(ext_id, date, repliespath, con)
        except json.decoder.JSONDecodeError:
            log_warning("- WARNING: Reply is not a proper json file!", 3)
        except Exception:
            log_exception("Exception when parsing reply", 3)
Example 5
def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck):
    ext_with_forums = list(set(forums_ext_ids))
    ext_without_forums = list(set(ext_ids) - set(forums_ext_ids))

    tups = [(ext_id, True) for ext_id in ext_with_forums] + [(ext_id, False) for ext_id in ext_without_forums]
    random.shuffle(tups)

    log_info("Updating {} extensions ({} including forums, {} excluding forums)".format(len(tups), len(ext_with_forums),
        len(ext_without_forums)))

    with MysqlProcessBackend(
            None,
            read_default_file=const_mysql_config_file(),
            charset='utf8mb4') as con:
        results = []
        with ProcessPool(max_workers=parallel, initializer=init_process,
                         initargs=(verbose, start_pystuck, RequestManager(parallel))) as pool:
            future = pool.map(update_extension, [(archivedir, con, extid, archive) for extid, archive in tups], chunksize=1, timeout=timeout)
            iterator = future.result()
            for ext_id in ext_ids:
                try:
                    results.append(next(iterator))
                except StopIteration:
                    break
                except TimeoutError as error:
                    log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1]))
                    results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))
                except ProcessExpired as error:
                    log_warning("WorkerException: %s (%s), exit code: %d" % (error, ext_id, error.exitcode))
                    results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))
                except Exception as error:
                    log_warning("WorkerException: Processing %s raised %s" % (ext_id, error))
                    log_warning(error.traceback)  # Python's traceback of remote process
                    results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))

    return results
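The pool handling above follows pebble's map-with-timeout pattern: pool.map returns a future whose result() is an iterator, and each next() call surfaces the corresponding task's failure. A self-contained sketch of just that pattern, assuming only the pebble library; slow_square and the inputs are illustrative:

import time
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

def slow_square(x):
    time.sleep(x)
    return x * x

if __name__ == "__main__":
    with ProcessPool(max_workers=2) as pool:
        future = pool.map(slow_square, [0, 1, 5], timeout=3)
        iterator = future.result()
        while True:
            try:
                print(next(iterator))
            except StopIteration:
                break
            except TimeoutError as error:
                print("task took longer than {} seconds".format(error.args[1]))
            except ProcessExpired as error:
                print("worker died: {} (exit code {})".format(error, error.exitcode))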
Example 6
def parse_and_insert_crx(ext_id, datepath, con):
    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
    if not crx_path:
        return

    if os.path.getsize(crx_path) == 0:
        log_warning("- WARNING: crx file has size 0!", 3)
        return

    log_debug("- parsing crx file", 3)
    filename = os.path.basename(crx_path)

    with ZipFile(crx_path) as f:
        etag = get_etag(ext_id, datepath, con)

        size = os.path.getsize(crx_path)
        public_key = read_crx(crx_path).public_key

        with f.open("manifest.json") as m:
            raw_content = m.read()
            # There are some manifests that seem to have weird encodings...
            try:
                content = raw_content.decode("utf-8-sig")
            except UnicodeDecodeError:
                # Trying a different encoding, manifests are weird...
                content = raw_content.decode("latin1")

            con.insert(
                "crx",
                crx_etag=etag,
                filename=filename,
                size=size,
                manifest=content,
                publickey=public_key)

            manifest = json.loads(jsmin(content), strict=False)
            if "permissions" in manifest:
                for permission in manifest["permissions"]:
                    con.insert(
                        "permission",
                        crx_etag=etag,
                        permission_md5=hashlib.md5(
                            str(permission).encode()).digest(),
                        permission=str(permission))
            if "content_scripts" in manifest:
                for csd in manifest["content_scripts"]:
                    if "matches" in csd:
                        for urlpattern in csd["matches"]:
                            con.insert(
                                "content_script_url",
                                crx_etag=etag,
                                url_md5=hashlib.md5(
                                    str(urlpattern).encode()).digest(),
                                url=str(urlpattern))

        js_files = decompose_js_with_connection(f, con)
        for file_info in js_files:
            for prefix, typ in [("", "AS_IS"), ("normalized_", "NORMALIZED"),
                                ("dec_",
                                 "DECOMPRESSED"), ("dec_normalized_",
                                                   "DECOMPRESSED_NORMALIZED")]:
                if file_info[prefix + "md5"] is not None:
                    con.insert(
                        "crxfile",
                        crx_etag=etag,
                        path=file_info['path'],
                        filename=file_info['filename'],
                        mimetype=file_info["mimetype"][0],
                        mimetype_detail=file_info["mimetype"][1],
                        simhash=file_info["simhash"],
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        typ=typ)
                    con.insert(
                        "libdet",
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        size=file_info[prefix + "size"],
                        loc=file_info[prefix + "loc"],
                        description=file_info[prefix + "description"],
                        encoding=file_info[prefix + "encoding"],
                        mimetype_magic=file_info[prefix + "mimetype_magic"],
                        library=file_info["lib"],
                        version=file_info["version"],
                        typ=typ,
                        classification_type=file_info['type'].value,
                        detect_method=file_info['detectionMethod'].value,
                        detect_method_details=file_info[
                            'detectionMethodDetails'],
                        evidence_start_pos=file_info['evidenceStartPos'],
                        evidence_end_pos=file_info['evidenceEndPos'],
                        evidence_text=file_info['evidenceText'])
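The manifest handling above guards against inconsistently encoded manifests by trying UTF-8 (with BOM) first and falling back to Latin-1, which can decode any byte sequence. A minimal standalone sketch of that decode step; decode_manifest is an illustrative name, not part of the original code:

def decode_manifest(raw_content):
    # utf-8-sig strips a leading BOM if present; latin1 maps every byte,
    # so the fallback always succeeds even for oddly encoded manifests.
    try:
        return raw_content.decode("utf-8-sig")
    except UnicodeDecodeError:
        return raw_content.decode("latin1")

decode_manifest(b'\xef\xbb\xbf{"name": "example"}')  # '{"name": "example"}'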