Example #1
0
def parse_and_insert_support(ext_id, date, supportpath, con):
    """Parse a support-page JSON dump and insert one "support" row per
    review comment, plus the comment body into "support_comment".

    Args:
        ext_id: extension id the support file belongs to.
        date: crawl date string; stored via convert_date().
        supportpath: path to the raw support JSON file.
        con: database connection exposing insert().
    """
    log_debug("- parsing support file", 3)
    with open(supportpath) as f:
        content = f.read()
        # The dump may be prefixed with junk before the JSON payload; skip
        # to the first object.  If '{"' is absent, find() returns -1 and
        # raw_decode() below raises JSONDecodeError on the truncated tail.
        stripped = content[content.find('{"'):]
        # raw_decode() returns (object, end_index).  The original accessed
        # the decoded object inconsistently (next(iter(d), None) in one
        # place, d[0] in another); unpack it once and use it throughout.
        parsed, _ = json.JSONDecoder().raw_decode(stripped)
        annotations = get(parsed, "annotations")
        if annotations:
            for review in annotations:
                comment = get(review, "comment")
                if comment is not None:
                    # MD5 digest links the "support" row to the comment
                    # text stored in "support_comment".
                    commentmd5 = hashlib.md5(comment.encode()).digest()
                    con.insert(
                        "support",
                        extid=ext_id,
                        date=convert_date(date),
                        # NOTE(review): utcfromtimestamp() is deprecated
                        # since Python 3.12; kept to preserve the existing
                        # naive-UTC isoformat() output.
                        commentdate=datetime.datetime.utcfromtimestamp(
                            get(review, "timestamp")).isoformat()
                        if "timestamp" in review else None,
                        title=get(review, "title"),
                        commentmd5=commentmd5,
                        displayname=get(get(review, "entity"), "displayName"),
                        author=get(get(review, "entity"), "author"),
                        language=get(review, "language"),
                        shortauthor=get(get(review, "entity"), "shortAuthor"))
                    con.insert(
                        "support_comment",
                        comment=comment,
                        commentmd5=commentmd5)
Example #2
0
def parse_and_insert_replies(ext_id, date, repliespath, con):
    """Parse a replies JSON file and store every reply comment.

    Each comment yields one row in "reply" (metadata, keyed by the
    comment's MD5 digest) and one row in "reply_comment" (the text).
    """
    log_debug("- parsing reply file", 3)
    with open(repliespath) as f:
        results = json.load(f)
        if "searchResults" not in results:
            log_warning("* WARNING: there are no search results in {}".format(repliespath), 3)
            return
        for search_result in results["searchResults"]:
            for annotation in search_result.get("annotations", []):
                comment = get(annotation, "comment")
                if comment is None:
                    continue
                digest = hashlib.md5(comment.encode()).digest()
                entity = get(annotation, "entity")
                if "timestamp" in annotation:
                    commentdate = datetime.datetime.utcfromtimestamp(
                        get(annotation, "timestamp")).isoformat()
                else:
                    commentdate = None
                con.insert(
                    "reply",
                    extid=ext_id,
                    date=convert_date(date),
                    commentdate=commentdate,
                    replyto=get(get(entity, "annotation"), "author"),
                    commentmd5=digest,
                    displayname=get(entity, "displayName"),
                    author=get(entity, "author"),
                    language=get(annotation, "language"),
                    shortauthor=get(entity, "shortAuthor"))
                con.insert(
                    "reply_comment",
                    commentmd5=digest,
                    comment=comment)
Example #3
0
def parse_and_insert_status(ext_id, date, datepath, con):
    """Insert a "status" row recording the crawl outcome for this date:
    the overview-page status, the crx-download status, and the contents
    of overview.html.exception if that file exists."""
    log_debug("- parsing status file", 3)
    # Query the two status helpers in the same order as before.
    status_overview = get_overview_status(datepath)
    status_crx = get_crx_status(datepath)

    exception_path = os.path.join(datepath, "overview.html.exception")
    if os.path.exists(exception_path):
        with open(exception_path) as exc_file:
            exception_text = exc_file.read()
    else:
        exception_text = None

    con.insert(
        "status",
        extid=ext_id,
        date=convert_date(date),
        crx_status=status_crx,
        overview_status=status_overview,
        overview_exception=exception_text)
Example #4
0
def parse_and_insert_overview(ext_id, date, datepath, con):
    """Parse an extension's overview.html page and insert the extracted
    metadata into the "extension" table, plus one "category" row per
    listed category.

    Args:
        ext_id: extension id the page belongs to.
        date: crawl date string; stored via convert_date().
        datepath: directory expected to contain overview.html.
        con: database connection exposing insert().
    """
    log_debug("- parsing overview file", 3)
    overview_path = os.path.join(datepath, "overview.html")
    if os.path.exists(overview_path):
        with open(overview_path) as overview_file:
            contents = overview_file.read()

            # All patterns below are raw strings: the original non-raw
            # literals contained invalid escape sequences (\s, \d), which
            # raise a SyntaxWarning on modern Python.

            # Extract extension name
            match = re.search(r"""<meta itemprop="name" content="(.*?)"\s*/>""",
                              contents)
            name = match.group(1) if match else None

            # Extract extension version
            match = re.search(
                r"""<meta itemprop="version" content="(.*?)"\s*/>""", contents)
            version = match.group(1) if match else None

            match = re.search(
                r"""<meta itemprop="ratingValue" content="(.*?)"\s*/>""",
                contents)
            rating = float(match.group(1)) if match else None

            match = re.search(
                r"""<meta itemprop="ratingCount" content="(.*?)"\s*/>""",
                contents)
            rating_count = int(match.group(1)) if match else None

            # Extracts extension categories
            match = re.search(
                r"""Attribute name="category">(.+?)</Attribute>""", contents)
            categories = match.group(1).split(",") if match else None

            # Extracts the number of downloads.  The original pattern used
            # "(:?" (optional literal colon) where the non-capturing group
            # "(?:" was intended; fixed here — group(1) is unchanged.
            match = re.search(
                r"""<meta itemprop="interactionCount" content="UserDownloads:((?:\d|,)+)""",
                contents)
            downloads = int(match.group(1).replace(",", '')) if match else None

            # Extracts the full extension description as it appears on the
            # overview page
            doc = BeautifulSoup(contents, 'html.parser')

            description_parent = doc.find('div', itemprop="description")
            description = str(
                description_parent.contents[0]
            ) if description_parent and description_parent.contents else None
            full_description = str(
                description_parent.parent) if description_parent else None

            # "Offered by" block, located by its store CSS class.
            offeredby_parent = doc.find(
                class_=lambda cls: cls and "e-f-Me" in cls)
            offeredby = "".join([str(x) for x in offeredby_parent.contents
                                 ]) if offeredby_parent else None

            developer_parent = doc.find(
                class_=lambda cls: cls and "C-b-p-rc-D-J" in cls)
            developer = "".join([str(x) for x in developer_parent.contents
                                 ]) if developer_parent else None

            last_updated_parent = doc.find(
                class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
            last_updated = str(last_updated_parent.contents[
                0]) if last_updated_parent else None

            # The crx etag ties this overview snapshot to the crx row.
            etag = get_etag(ext_id, datepath, con)

            match = re.search(
                r"""<Attribute name="item_category">(.*?)</Attribute>""",
                contents)
            itemcategory = match.group(1) if match else None

            con.insert(
                "extension",
                extid=ext_id,
                date=convert_date(date),
                name=name,
                version=version,
                description=description,
                downloads=downloads,
                rating=rating,
                ratingcount=rating_count,
                fulldescription=full_description,
                offeredby=offeredby,
                developer=developer,
                itemcategory=itemcategory,
                crx_etag=etag,
                lastupdated=last_updated)

            if categories:
                for category in categories:
                    con.insert(
                        "category",
                        extid=ext_id,
                        date=convert_date(date),
                        category_md5=hashlib.md5(category.encode()).digest(),
                        category=category)
Example #5
0
def parse_and_insert_crx(ext_id, datepath, con):
    """Parse the downloaded .crx archive in *datepath* and insert its
    metadata into the database.

    Inserts one "crx" row (manifest text, size, public key), one
    "permission" row per manifest permission, one "content_script_url"
    row per content-script match pattern, and — for every contained JS
    file and every normalization variant present — one "crxfile" and one
    "libdet" row.  Silently returns when no .crx exists or it is empty.
    """
    # Take the first *.crx found in the date directory, if any.
    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
    if not crx_path:
        return

    if os.path.getsize(crx_path) == 0:
        log_warning("- WARNING: crx file has size 0!", 3)
        return

    log_debug("- parsing crx file", 3)
    filename = os.path.basename(crx_path)

    with ZipFile(crx_path) as f:
        # The etag identifies this crx version across tables.
        etag = get_etag(ext_id, datepath, con)

        size = os.path.getsize(crx_path)
        public_key = read_crx(crx_path).public_key

        with f.open("manifest.json") as m:
            raw_content = m.read()
            # There are some manifests that seem to have weird encodings...
            try:
                content = raw_content.decode("utf-8-sig")
            except UnicodeDecodeError:
                # Trying a different encoding, manifests are weird...
                content = raw_content.decode("latin1")

            con.insert(
                "crx",
                crx_etag=etag,
                filename=filename,
                size=size,
                manifest=content,
                publickey=public_key)

            # jsmin strips comments so that non-strict JSON still parses.
            manifest = json.loads(jsmin(content), strict=False)
            if "permissions" in manifest:
                for permission in manifest["permissions"]:
                    # str() because permissions may be dicts, not strings.
                    con.insert(
                        "permission",
                        crx_etag=etag,
                        permission_md5=hashlib.md5(
                            str(permission).encode()).digest(),
                        permission=str(permission))
            if "content_scripts" in manifest:
                for csd in manifest["content_scripts"]:
                    if "matches" in csd:
                        for urlpattern in csd["matches"]:
                            con.insert(
                                "content_script_url",
                                crx_etag=etag,
                                url_md5=hashlib.md5(
                                    str(urlpattern).encode()).digest(),
                                url=str(urlpattern))

        # Per-file analysis results; each entry may carry up to four
        # variants of the file (raw, normalized, decompressed, both),
        # distinguished by the key prefix.
        js_files = decompose_js_with_connection(f, con)
        for file_info in js_files:
            for prefix, typ in [("", "AS_IS"), ("normalized_", "NORMALIZED"),
                                ("dec_",
                                 "DECOMPRESSED"), ("dec_normalized_",
                                                   "DECOMPRESSED_NORMALIZED")]:
                # A missing md5 means this variant was not produced.
                if file_info[prefix + "md5"] is not None:
                    con.insert(
                        "crxfile",
                        crx_etag=etag,
                        path=file_info['path'],
                        filename=file_info['filename'],
                        mimetype=file_info["mimetype"][0],
                        mimetype_detail=file_info["mimetype"][1],
                        simhash=file_info["simhash"],
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        typ=typ)
                    # Library-detection result for the same variant.
                    con.insert(
                        "libdet",
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        size=file_info[prefix + "size"],
                        loc=file_info[prefix + "loc"],
                        description=file_info[prefix + "description"],
                        encoding=file_info[prefix + "encoding"],
                        mimetype_magic=file_info[prefix + "mimetype_magic"],
                        library=file_info["lib"],
                        version=file_info["version"],
                        typ=typ,
                        classification_type=file_info['type'].value,
                        detect_method=file_info['detectionMethod'].value,
                        detect_method_details=file_info[
                            'detectionMethodDetails'],
                        evidence_start_pos=file_info['evidenceStartPos'],
                        evidence_end_pos=file_info['evidenceEndPos'],
                        evidence_text=file_info['evidenceText'])