def retry(self, f):
    """Run f(), reconnecting and retrying on MySQL OperationalError."""
    for t in range(self.maxtries):
        try:
            self._create_conn()
            return f()
        except MySQLdb._exceptions.OperationalError as e:
            last_exception = e

            try:
                self._close_conn()
            except Exception as e2:
                log_error("Suppressed exception: {}".format(str(e2)), 3)

            if t + 1 == self.maxtries:
                log_error(
                    "MySQL connection eventually failed, closing connection!",
                    3)
                raise last_exception
            else:
                factor = 0.2
                logmsg = ("Exception ({}) on mysql connection attempt "
                          "{} of {}, waiting {}s +/- {}% before retrying..."
                          ).format(
                              str(e), t + 1, self.maxtries, self.try_wait,
                              factor * 100)
                log_warning(logmsg, 3)
                time.sleep(self.try_wait * uniform(1 - factor, 1 + factor))
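
# A minimal usage sketch (the caller below is hypothetical; actual call
# sites live elsewhere in this backend class): retry() reconnects and
# re-runs the supplied thunk whenever MySQL raises an OperationalError,
# so callers wrap each database operation in a zero-argument callable:
#
#     def insert(self, table, **kwargs):
#         return self.retry(lambda: self._do_insert(table, **kwargs))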
def parse_and_insert_replies(ext_id, date, repliespath, con):
    log_debug("- parsing reply file", 3)
    with open(repliespath) as f:
        d = json.load(f)
        if "searchResults" not in d:
            log_warning(
                "* WARNING: there are no search results in {}".format(
                    repliespath), 3)
            return
        for result in d["searchResults"]:
            if "annotations" not in result:
                continue
            for annotation in result["annotations"]:
                comment = get(annotation, "comment")
                if comment is not None:
                    commentmd5 = hashlib.md5(comment.encode()).digest()
                    con.insert(
                        "reply",
                        extid=ext_id,
                        date=convert_date(date),
                        commentdate=datetime.datetime.utcfromtimestamp(
                            get(annotation, "timestamp")).isoformat()
                        if "timestamp" in annotation else None,
                        replyto=get(
                            get(get(annotation, "entity"), "annotation"),
                            "author"),
                        commentmd5=commentmd5,
                        displayname=get(
                            get(annotation, "entity"), "displayName"),
                        author=get(get(annotation, "entity"), "author"),
                        language=get(annotation, "language"),
                        shortauthor=get(
                            get(annotation, "entity"), "shortAuthor"))
                    con.insert(
                        "reply_comment",
                        commentmd5=commentmd5,
                        comment=comment)
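
# For reference, the reply-file structure this parser expects, inferred
# from the field accesses above (key names come from the code; the
# values are illustrative only):
#
#     {"searchResults": [
#         {"annotations": [
#             {"comment": "...",
#              "timestamp": 1500000000,
#              "language": "en",
#              "entity": {"author": "...",
#                         "shortAuthor": "...",
#                         "displayName": "...",
#                         "annotation": {"author": "..."}}}]}]}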
def get_etag(ext_id, datepath, con):
    # Trying to parse etag file
    etagpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.etag"))), None)
    if etagpath:
        with open(etagpath) as f:
            return f.read()

    # Trying to parse header file for etag
    headerpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.headers"))), None)
    if headerpath:
        with open(headerpath) as f:
            content = f.read()
            try:
                headers = ast.literal_eval(content)
                if "ETag" in headers:
                    return headers["ETag"]
            except Exception:
                log_warning("* WARNING: could not parse crx header file", 3)

    # Trying to look up previous etag in database
    linkpath = next(
        iter(glob.glob(os.path.join(datepath, "*.crx.link"))), None)
    if linkpath:
        with open(linkpath) as f:
            link = f.read()
        linked_date = link[3:].split("/")[0]

        result = con.get_etag(ext_id, convert_date(linked_date))
        if result is not None:
            return result

    return None
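
# Note on the headers file parsed above: it stores a Python dict literal
# (hence ast.literal_eval rather than json.loads). An illustrative
# example, assuming typical HTTP response headers:
#
#     {'ETag': '"abc123"', 'Content-Type': 'application/x-chrome-extension'}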
def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
    """Parse all files archived for one date and insert them into the db."""
    log_info("* Updating db with data from {}".format(date), 2)
    datepath = os.path.join(tmptardir, date)

    etag = get_etag(ext_id, datepath, con)
    if etag:
        try:
            parse_and_insert_crx(ext_id, datepath, con)
        except Exception:
            log_exception("Exception when parsing crx", 3)
    else:
        crx_status = get_crx_status(datepath)
        if crx_status != 401 and crx_status != 204 and crx_status != 404:
            log_warning("* WARNING: could not find etag", 3)

    try:
        parse_and_insert_overview(ext_id, date, datepath, con)
    except Exception:
        log_exception("Exception when parsing overview", 3)

    try:
        parse_and_insert_status(ext_id, date, datepath, con)
    except Exception:
        log_exception("Exception when parsing status", 3)

    reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
    for reviewpath in reviewpaths:
        try:
            parse_and_insert_review(ext_id, date, reviewpath, con)
        except json.decoder.JSONDecodeError:
            log_warning("- WARNING: Review is not a proper json file!", 3)
        except Exception:
            log_exception("Exception when parsing review", 3)

    supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
    for supportpath in supportpaths:
        try:
            parse_and_insert_support(ext_id, date, supportpath, con)
        except json.decoder.JSONDecodeError:
            log_warning("- WARNING: Support is not a proper json file!", 3)
        except Exception:
            log_exception("Exception when parsing support", 3)

    repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
    for repliespath in repliespaths:
        try:
            parse_and_insert_replies(ext_id, date, repliespath, con)
        except json.decoder.JSONDecodeError:
            log_warning("- WARNING: Reply is not a proper json file!", 3)
        except Exception:
            log_exception("Exception when parsing reply", 3)
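
# For orientation, the per-date directory layout implied by the glob
# patterns in this function, get_etag(), and parse_and_insert_crx()
# (the concrete file names are illustrative; only the patterns are
# taken from the code):
#
#     <tmptardir>/<date>/
#         *.crx            crx package (parsed when an etag is found)
#         *.crx.etag       etag of the crx download
#         *.crx.headers    HTTP response headers (Python dict literal)
#         *.crx.link       link to the date that holds the actual crx
#         reviews*-*.text  review pages (json)
#         support*-*.text  support pages (json)
#         *replies.text    forum replies (json)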
def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout,
                      verbose, start_pystuck):
    """Update all extensions in a process pool and collect UpdateResults."""
    ext_with_forums = list(set(forums_ext_ids))
    ext_without_forums = list(set(ext_ids) - set(forums_ext_ids))
    tups = [(ext_id, True) for ext_id in ext_with_forums] + [
        (ext_id, False) for ext_id in ext_without_forums
    ]
    random.shuffle(tups)

    log_info("Updating {} extensions ({} including forums, {} excluding "
             "forums)".format(
                 len(tups), len(ext_with_forums), len(ext_without_forums)))

    with MysqlProcessBackend(
            None,
            read_default_file=const_mysql_config_file(),
            charset='utf8mb4') as con:
        results = []
        with ProcessPool(
                max_workers=parallel,
                initializer=init_process,
                initargs=(verbose, start_pystuck,
                          RequestManager(parallel))) as pool:
            future = pool.map(
                update_extension, [(archivedir, con, extid, archive)
                                   for extid, archive in tups],
                chunksize=1,
                timeout=timeout)
            iterator = future.result()
            # Results arrive in submission order, i.e. the order of the
            # (shuffled) tups, so iterate over tups rather than ext_ids
            # to attribute failures to the correct extension id.
            for ext_id, _ in tups:
                try:
                    results.append(next(iterator))
                except StopIteration:
                    break
                except TimeoutError as error:
                    log_warning(
                        "WorkerException: Processing of %s took longer than "
                        "%d seconds" % (ext_id, error.args[1]))
                    results.append(
                        UpdateResult(ext_id, False, None, None, None, None,
                                     None, None, None, error))
                except ProcessExpired as error:
                    log_warning("WorkerException: %s (%s), exit code: %d" %
                                (error, ext_id, error.exitcode))
                    results.append(
                        UpdateResult(ext_id, False, None, None, None, None,
                                     None, None, None, error))
                except Exception as error:
                    log_warning("WorkerException: Processing %s raised %s" %
                                (ext_id, error))
                    # Python's traceback of the remote process
                    log_warning(error.traceback)
                    results.append(
                        UpdateResult(ext_id, False, None, None, None, None,
                                     None, None, None, error))

    return results
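
# The pool maps update_extension over 4-tuples, so the worker (defined
# elsewhere in this module) is expected to unpack a single argument;
# a rough sketch only, inferred from the pool.map() call above:
#
#     def update_extension(args):
#         archivedir, con, extid, include_forums = args
#         ...
#         return UpdateResult(extid, ...)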
def parse_and_insert_crx(ext_id, datepath, con):
    crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
    if not crx_path:
        return

    if os.path.getsize(crx_path) == 0:
        log_warning("- WARNING: crx file has size 0!", 3)
        return

    log_debug("- parsing crx file", 3)
    filename = os.path.basename(crx_path)

    with ZipFile(crx_path) as f:
        etag = get_etag(ext_id, datepath, con)
        size = os.path.getsize(crx_path)
        public_key = read_crx(crx_path).public_key

        with f.open("manifest.json") as m:
            raw_content = m.read()
            # There are some manifests that seem to have weird encodings...
            try:
                content = raw_content.decode("utf-8-sig")
            except UnicodeDecodeError:
                # Trying a different encoding, manifests are weird...
                content = raw_content.decode("latin1")

            con.insert(
                "crx",
                crx_etag=etag,
                filename=filename,
                size=size,
                manifest=content,
                publickey=public_key)

            manifest = json.loads(jsmin(content), strict=False)
            if "permissions" in manifest:
                for permission in manifest["permissions"]:
                    con.insert(
                        "permission",
                        crx_etag=etag,
                        permission_md5=hashlib.md5(
                            str(permission).encode()).digest(),
                        permission=str(permission))
            if "content_scripts" in manifest:
                for csd in manifest["content_scripts"]:
                    if "matches" in csd:
                        for urlpattern in csd["matches"]:
                            con.insert(
                                "content_script_url",
                                crx_etag=etag,
                                url_md5=hashlib.md5(
                                    str(urlpattern).encode()).digest(),
                                url=str(urlpattern))

        js_files = decompose_js_with_connection(f, con)
        for file_info in js_files:
            for prefix, typ in [("", "AS_IS"),
                                ("normalized_", "NORMALIZED"),
                                ("dec_", "DECOMPRESSED"),
                                ("dec_normalized_",
                                 "DECOMPRESSED_NORMALIZED")]:
                if file_info[prefix + "md5"] is not None:
                    con.insert(
                        "crxfile",
                        crx_etag=etag,
                        path=file_info['path'],
                        filename=file_info['filename'],
                        mimetype=file_info["mimetype"][0],
                        mimetype_detail=file_info["mimetype"][1],
                        simhash=file_info["simhash"],
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        typ=typ)
                    con.insert(
                        "libdet",
                        md5=file_info[prefix + "md5"],
                        sha1=file_info[prefix + "sha1"],
                        sha256=file_info[prefix + "sha256"],
                        size=file_info[prefix + "size"],
                        loc=file_info[prefix + "loc"],
                        description=file_info[prefix + "description"],
                        encoding=file_info[prefix + "encoding"],
                        mimetype_magic=file_info[prefix + "mimetype_magic"],
                        library=file_info["lib"],
                        version=file_info["version"],
                        typ=typ,
                        classification_type=file_info['type'].value,
                        detect_method=file_info['detectionMethod'].value,
                        detect_method_details=file_info[
                            'detectionMethodDetails'],
                        evidence_start_pos=file_info['evidenceStartPos'],
                        evidence_end_pos=file_info['evidenceEndPos'],
                        evidence_text=file_info['evidenceText'])
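
# A minimal manifest.json illustrating the only two keys this parser
# decomposes ("permissions" and the "matches" lists of "content_scripts");
# the rest of the manifest is stored verbatim in the crx table:
#
#     {"permissions": ["tabs", "storage"],
#      "content_scripts": [
#          {"matches": ["https://*/*"], "js": ["content.js"]}]}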