def parse_and_insert_support(ext_id, date, supportpath, con):
    """Parse a support JSON file and insert its reviews and comments.

    The raw file may carry junk before the JSON payload, so decoding
    starts at the first '{"' and uses raw_decode to tolerate trailing bytes.
    """
    log_debug("- parsing support file", 3)
    with open(supportpath) as f:
        raw = f.read()
    # Drop any leading non-JSON bytes, then decode just the first object.
    decoded = json.JSONDecoder().raw_decode(raw[raw.find('{"'):])
    reviews = get(next(iter(decoded), None), "annotations")
    if not reviews:
        return
    for review in reviews:
        comment = get(review, "comment")
        if comment is None:
            continue
        digest = hashlib.md5(comment.encode()).digest()
        entity = get(review, "entity")
        con.insert(
            "support",
            extid=ext_id,
            date=convert_date(date),
            commentdate=datetime.datetime.utcfromtimestamp(
                get(review, "timestamp")).isoformat()
            if "timestamp" in review else None,
            title=get(review, "title"),
            commentmd5=digest,
            displayname=get(entity, "displayName"),
            author=get(entity, "author"),
            language=get(review, "language"),
            shortauthor=get(entity, "shortAuthor"))
        # Comment text is stored separately, keyed by its md5 digest.
        con.insert("support_comment", comment=comment, commentmd5=digest)
def parse_and_insert_replies(ext_id, date, repliespath, con):
    """Parse a replies JSON file and insert each reply and its comment text."""
    log_debug("- parsing reply file", 3)
    with open(repliespath) as f:
        data = json.load(f)
    if "searchResults" not in data:
        log_warning("* WARNING: there are no search results in {}".format(
            repliespath), 3)
        return
    for result in data["searchResults"]:
        for annotation in result.get("annotations", []):
            comment = get(annotation, "comment")
            if comment is None:
                continue
            digest = hashlib.md5(comment.encode()).digest()
            entity = get(annotation, "entity")
            con.insert(
                "reply",
                extid=ext_id,
                date=convert_date(date),
                commentdate=datetime.datetime.utcfromtimestamp(
                    get(annotation, "timestamp")).isoformat()
                if "timestamp" in annotation else None,
                replyto=get(get(entity, "annotation"), "author"),
                commentmd5=digest,
                displayname=get(entity, "displayName"),
                author=get(entity, "author"),
                language=get(annotation, "language"),
                shortauthor=get(entity, "shortAuthor"))
            # Comment text is stored separately, keyed by its md5 digest.
            con.insert("reply_comment", commentmd5=digest, comment=comment)
def parse_and_insert_status(ext_id, date, datepath, con):
    """Insert one "status" row: overview/crx fetch status plus any
    recorded overview exception text."""
    log_debug("- parsing status file", 3)
    overview_state = get_overview_status(datepath)
    crx_state = get_crx_status(datepath)
    exception_path = os.path.join(datepath, "overview.html.exception")
    exception_text = None
    if os.path.exists(exception_path):
        with open(exception_path) as f:
            exception_text = f.read()
    con.insert(
        "status",
        extid=ext_id,
        date=convert_date(date),
        crx_status=crx_state,
        overview_status=overview_state,
        overview_exception=exception_text)
def parse_and_insert_overview(ext_id, date, datepath, con):
    """Scrape the saved overview.html page and insert extension metadata.

    Reads <datepath>/overview.html (no-op when absent), extracts name,
    version, rating, rating count, download count, categories, descriptions
    and developer info, then inserts one "extension" row and one "category"
    row per extracted category.
    """
    log_debug("- parsing overview file", 3)
    overview_path = os.path.join(datepath, "overview.html")
    if not os.path.exists(overview_path):
        return
    with open(overview_path) as overview_file:
        contents = overview_file.read()

    def first_group(pattern):
        # First capture group of `pattern` in the page, or None when absent.
        match = re.search(pattern, contents)
        return match.group(1) if match else None

    # Patterns are raw strings now: the originals used "\s"/"\d" inside
    # plain string literals, which are invalid escape sequences
    # (SyntaxWarning on modern Python).
    name = first_group(r'<meta itemprop="name" content="(.*?)"\s*/>')
    version = first_group(r'<meta itemprop="version" content="(.*?)"\s*/>')
    rating_raw = first_group(
        r'<meta itemprop="ratingValue" content="(.*?)"\s*/>')
    rating = float(rating_raw) if rating_raw is not None else None
    rating_count_raw = first_group(
        r'<meta itemprop="ratingCount" content="(.*?)"\s*/>')
    rating_count = int(
        rating_count_raw) if rating_count_raw is not None else None
    categories_raw = first_group(
        r'Attribute name="category">(.+?)</Attribute>')
    categories = categories_raw.split(
        ",") if categories_raw is not None else None
    # Bug fix: the group was written "(:?\d|,)" (an optional literal colon)
    # instead of the intended non-capturing group "(?:\d|,)".
    downloads_raw = first_group(
        r'<meta itemprop="interactionCount" content="UserDownloads:((?:\d|,)+)'
    )
    downloads = int(downloads_raw.replace(
        ",", "")) if downloads_raw is not None else None

    # Description/developer blocks are not exposed via <meta> tags, so fall
    # back to parsing the DOM.
    doc = BeautifulSoup(contents, 'html.parser')
    description_parent = doc.find('div', itemprop="description")
    description = str(
        description_parent.contents[0]
    ) if description_parent and description_parent.contents else None
    full_description = str(
        description_parent.parent) if description_parent else None

    def joined_class_contents(marker):
        # Children of the first node whose class contains `marker`, joined
        # into one HTML string; None when no such node exists.
        node = doc.find(class_=lambda cls: cls and marker in cls)
        return "".join(str(child) for child in node.contents) if node else None

    # The class-name markers below are obfuscated store CSS classes —
    # presumably stable identifiers for these page sections; verify against
    # current store markup if extraction starts returning None.
    offeredby = joined_class_contents("e-f-Me")
    developer = joined_class_contents("C-b-p-rc-D-J")
    last_updated_parent = doc.find(
        class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
    last_updated = str(
        last_updated_parent.contents[0]) if last_updated_parent else None
    etag = get_etag(ext_id, datepath, con)
    itemcategory = first_group(
        r'<Attribute name="item_category">(.*?)</Attribute>')
    con.insert(
        "extension",
        extid=ext_id,
        date=convert_date(date),
        name=name,
        version=version,
        description=description,
        downloads=downloads,
        rating=rating,
        ratingcount=rating_count,
        fulldescription=full_description,
        offeredby=offeredby,
        developer=developer,
        itemcategory=itemcategory,
        crx_etag=etag,
        lastupdated=last_updated)
    if categories:
        for category in categories:
            con.insert(
                "category",
                extid=ext_id,
                date=convert_date(date),
                category_md5=hashlib.md5(category.encode()).digest(),
                category=category)
def parse_and_insert_crx(ext_id, datepath, con):
    """Parse the downloaded .crx archive and insert its metadata rows.

    Picks the first *.crx in `datepath` (no-op when none exists or the file
    has size 0), stores the raw manifest plus the declared permissions and
    content-script URL patterns, then records per-file hash/library-detection
    data in up to four variants (as-is, normalized, decompressed,
    decompressed+normalized).
    """
    # Only one crx per date directory is expected; take the first match.
    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
    if not crx_path:
        return
    if os.path.getsize(crx_path) == 0:
        log_warning("- WARNING: crx file has size 0!", 3)
        return
    log_debug("- parsing crx file", 3)
    filename = os.path.basename(crx_path)
    # The crx opens as a zip archive; read_crx (project helper) additionally
    # yields the signing public key — presumably from the crx header; confirm
    # against read_crx's definition.
    with ZipFile(crx_path) as f:
        etag = get_etag(ext_id, datepath, con)
        size = os.path.getsize(crx_path)
        public_key = read_crx(crx_path).public_key
        with f.open("manifest.json") as m:
            raw_content = m.read()
            # There are some manifests that seem to have weird encodings...
            try:
                content = raw_content.decode("utf-8-sig")
            except UnicodeDecodeError:
                # Trying a different encoding, manifests are weird...
                content = raw_content.decode("latin1")
            con.insert(
                "crx",
                crx_etag=etag,
                filename=filename,
                size=size,
                manifest=content,
                publickey=public_key)
            # jsmin is applied before decoding — presumably to strip
            # comments from not-quite-JSON manifests; strict=False tolerates
            # control characters inside strings.
            manifest = json.loads(jsmin(content), strict=False)
            if "permissions" in manifest:
                for permission in manifest["permissions"]:
                    # str() normalizes both string and object-valued
                    # permission entries for hashing and storage.
                    con.insert(
                        "permission",
                        crx_etag=etag,
                        permission_md5=hashlib.md5(
                            str(permission).encode()).digest(),
                        permission=str(permission))
            if "content_scripts" in manifest:
                for csd in manifest["content_scripts"]:
                    if "matches" in csd:
                        for urlpattern in csd["matches"]:
                            con.insert(
                                "content_script_url",
                                crx_etag=etag,
                                url_md5=hashlib.md5(
                                    str(urlpattern).encode()).digest(),
                                url=str(urlpattern))
        # Project helper: returns one dict per analyzed file (presumably the
        # JavaScript files of the archive — confirm against its definition),
        # carrying hashes and library-detection metadata.
        js_files = decompose_js_with_connection(f, con)
        for file_info in js_files:
            # Each file is recorded once per available variant; the variant's
            # dict entries are distinguished by a key prefix.
            for prefix, typ in [("", "AS_IS"), ("normalized_", "NORMALIZED"),
                                ("dec_", "DECOMPRESSED"),
                                ("dec_normalized_",
                                 "DECOMPRESSED_NORMALIZED")]:
                # A missing md5 means this variant was not produced.
                if file_info[prefix + "md5"] is not None:
                    con.insert(
                        "crxfile",
                        crx_etag=etag,
                        path=file_info['path'],
                        filename=file_info['filename'],
                        mimetype=file_info["mimetype"][0],
                        mimetype_detail=file_info["mimetype"][1],
                        simhash=file_info["simhash"],
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        typ=typ)
                    con.insert(
                        "libdet",
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        size=file_info[prefix + "size"],
                        loc=file_info[prefix + "loc"],
                        description=file_info[prefix + "description"],
                        encoding=file_info[prefix + "encoding"],
                        mimetype_magic=file_info[prefix + "mimetype_magic"],
                        library=file_info["lib"],
                        version=file_info["version"],
                        typ=typ,
                        # .value accesses suggest enum-valued fields —
                        # confirm against the decompose helper's types.
                        classification_type=file_info['type'].value,
                        detect_method=file_info['detectionMethod'].value,
                        detect_method_details=file_info[
                            'detectionMethodDetails'],
                        evidence_start_pos=file_info['evidenceStartPos'],
                        evidence_end_pos=file_info['evidenceEndPos'],
                        evidence_text=file_info['evidenceText'])