import datetime
import json
import os
import re
import shutil
import tarfile
import tempfile
import time
import traceback

import dateutil.parser
import requests

# NOTE: helpers such as get_local_archive_dir, log_info, log_exception,
# write_text, value_of, set_logger_tag, request_manager, const_download_url,
# last_modified_http_date, last_modified_utc_date, store_request_metadata,
# validate_crx_response, update_overview, update_reviews, update_support,
# update_db_incremental, UpdateResult, and RequestResult are assumed to be
# defined elsewhere in the crawler package.


def last_crx(archivedir, extid, date=None):
    last_crx_path = ""
    last_crx_etag = ""
    etag_file = os.path.join(archivedir, get_local_archive_dir(extid),
                             extid + ".etag")
    if date is None and os.path.exists(etag_file):
        try:
            with open(etag_file, 'r') as f:
                d = json.load(f)
                return d["last_crx"], d["last_crx_etag"]
        except Exception:
            log_exception(
                "Something was wrong with the etag file {}, deleting it ...".
                format(etag_file))
            try:
                os.remove(etag_file)
            except Exception:
                log_exception(
                    "Could not remove etag file {}!".format(etag_file))

    # If we do not yet have an .etag file present, open the tar file and look
    # there for one. After having done that once, the crawler creates the
    # .etag file to avoid opening the tar file in the future.
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        with tarfile.open(tar, 'r') as t:
            old_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or (dateutil.parser.parse(
                        os.path.split(os.path.split(x.name)[0])[1]) <= date))
            ])
            if old_crxs:
                last_crx_path = old_crxs[-1]
                # The .headers file stores a Python-repr-style dict; escape
                # embedded double quotes and swap the single quotes so it
                # parses as JSON.
                headers_content = t.extractfile(
                    last_crx_path + ".headers").read().decode().replace(
                        '"', '\\"').replace("'", '"')
                headers_json = json.loads(headers_content)
                last_crx_etag = headers_json["ETag"]
                if date is None:
                    with open(etag_file, 'w') as f:
                        json.dump({
                            "last_crx": last_crx_path,
                            "last_crx_etag": last_crx_etag
                        }, f)
    return last_crx_path, last_crx_etag
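
# Usage sketch (illustrative only; the archive path and the 32-character
# extension id below are placeholders, not values from this project):
def _demo_last_crx():
    crx_path, crx_etag = last_crx("/srv/archive", "a" * 32)
    print("latest crx: {!r}, etag: {!r}".format(crx_path, crx_etag))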

def all_crx(archivedir, extid, date=None):
    # The date parameter is currently unused: every archived .crx is listed.
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    all_crxs = []
    if os.path.exists(tar):
        with tarfile.open(tar, 'r') as t:
            all_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0
            ])
    return all_crxs

def first_crx(archivedir, extid, date=None):
    first_crx_path = ""
    tar = os.path.join(archivedir, get_local_archive_dir(extid),
                       extid + ".tar")
    if os.path.exists(tar):
        with tarfile.open(tar, 'r') as t:
            old_crxs = sorted([
                x.name for x in t.getmembers()
                if x.name.endswith(".crx") and x.size > 0 and (
                    date is None or (date <= dateutil.parser.parse(
                        os.path.split(os.path.split(x.name)[0])[1])))
            ])
        if old_crxs:
            first_crx_path = old_crxs[0]
    return first_crx_path
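
# Usage sketch for the two listing helpers above (illustrative; the path and
# id are placeholders). first_crx expects an offset-aware datetime, since the
# per-date directory names inside the tar are timezone-aware ISO timestamps.
def _demo_crx_listing():
    extid = "a" * 32
    print(all_crx("/srv/archive", extid))
    cutoff = dateutil.parser.parse("2018-01-01T00:00:00+00:00")
    print(first_crx("/srv/archive", extid, cutoff))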

def iter_tar_entries_from_file_ext(archivedir, extid, ext):
    # Generator: yields (TarInfo, file object) pairs for every regular file
    # in the archive with the given filename suffix, e.g. ".tar".
    tar = os.path.join(archivedir, get_local_archive_dir(extid), extid + ext)
    with tarfile.open(tar, 'r') as tf:
        for tarentry in tf:
            if tarentry.isfile():
                yield (tarentry, tf.extractfile(tarentry))
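
# Usage sketch (illustrative): stream every regular file out of an
# extension's main tar without unpacking it to disk first.
def _demo_iter_tar_entries():
    for entry, fileobj in iter_tar_entries_from_file_ext(
            "/srv/archive", "a" * 32, ".tar"):
        print(entry.name, len(fileobj.read()))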

def update_extension(tup):
    archivedir, con, ext_id, forums = tup
    update_db = False
    set_logger_tag(ext_id)
    log_info("Updating extension{}".format(
        " (including forums)" if forums else ""), 1)
    is_new = False
    tar_exception = None
    sql_exception = None
    sql_success = False
    tmptardir = ""
    start = time.time()
    date = datetime.datetime.now(datetime.timezone.utc).isoformat()
    tardir = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id)
    tar = tardir + ".tar"

    try:
        tmpdir = tempfile.mkdtemp()
        tmptardir = os.path.join(tmpdir, ext_id)
        log_info("* tmptardir = {}".format(tmptardir), 2)
        os.makedirs(
            os.path.join(archivedir, get_local_archive_dir(ext_id)),
            exist_ok=True)
    except Exception as e:
        log_exception("* FATAL: cannot create tmpdir", 3)
        tar_exception = e
        return UpdateResult(ext_id, is_new, tar_exception, None, None, None,
                            None, sql_exception, False)

    res_overview = update_overview(tmptardir, date, ext_id)
    res_reviews = None
    res_support = None
    if forums:
        res_reviews = update_reviews(tmptardir, date, ext_id)
    res_crx = update_crx(archivedir, tmptardir, ext_id, date)
    if forums:
        res_support = update_support(tmptardir, date, ext_id)

    # Backups are disabled by default. When enabled, the previous backup is
    # rotated to a dated name (and then discarded) before the current tar is
    # snapshotted to <ext_id>.bak.tar.
    backup = False
    if backup:
        try:
            os.sync()
            if os.path.exists(tardir + ".bak.tar"):
                shutil.move(tardir + ".bak.tar",
                            tardir + ".bak." + date + ".tar")
                os.remove(tardir + ".bak." + date + ".tar")
        except Exception:
            pass

        try:
            if os.path.exists(tar):
                shutil.copyfile(tar, tardir + ".bak.tar")
        except Exception as e:
            log_exception("* FATAL: cannot rename old tar archive", 3)
            tar_exception = e
            try:
                write_text(tardir, date, ext_id + ".tar.rename.exception",
                           traceback.format_exc())
            except Exception:
                pass

    if not os.path.exists(tar):
        is_new = True
    try:
        # Use a separate timer for the append so the overall duration
        # reported below is not clobbered.
        tar_start = time.time()
        with tarfile.open(tar, mode='a:') as ar:
            ar.add(tmptardir, arcname=ext_id)
        log_info("* Appending new data to tar took {:.2f}s".format(
            time.time() - tar_start), 2)
    except Exception as e:
        log_exception("* FATAL: cannot create tar archive", 3)
        tar_exception = e
        try:
            write_text(tardir, date, ext_id + ".tar.create.exception",
                       traceback.format_exc())
        except Exception:
            pass

    if update_db:
        try:
            update_db_incremental(tmptardir, ext_id, date, con)
            sql_success = True
        except Exception as e:
            log_exception("* Exception during update of db", 3)
            sql_exception = e
            try:
                write_text(tardir, date, ext_id + ".sql.exception",
                           traceback.format_exc())
            except Exception:
                pass
    else:
        log_info("* DB Update disabled")

    try:
        shutil.rmtree(path=tmpdir)
    except Exception as e:
        log_exception("* FATAL: cannot remove archive directory", 3)
        tar_exception = e
        try:
            write_text(tardir, date, ext_id + ".dir.remove.exception",
                       traceback.format_exc())
        except Exception:
            pass

    log_info("* Duration: {}".format(
        datetime.timedelta(seconds=int(time.time() - start))), 2)
    return UpdateResult(ext_id, is_new, tar_exception, res_overview, res_crx,
                        res_reviews, res_support, sql_exception, sql_success)
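
# Usage sketch (illustrative): the single-tuple signature exists so the
# function can be mapped over a worker pool. con may be None here because
# update_db is hard-coded to False above; the path and id are placeholders.
def _demo_update_extension():
    result = update_extension(("/srv/archive", None, "a" * 32, False))
    print(result)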

def update_crx(archivedir, tmptardir, ext_id, date):
    res = None
    extfilename = "default_ext_archive.crx"
    last_crx_file, last_crx_etag = last_crx(archivedir, ext_id)
    last_crx_http_date = last_modified_http_date(last_crx_file)
    headers = {}
    if last_crx_file != "":
        headers = {'If-Modified-Since': last_crx_http_date}
    try:
        log_info("* Checking If-Modified-Since", 2)
        with request_manager.normal_request():
            res = requests.get(
                const_download_url().format(ext_id),
                stream=True,
                headers=headers,
                timeout=10)
        log_info("* crx archive (Last: {}): {}".format(
            value_of(last_crx_http_date, "n/a"), str(res.status_code)), 2)
        extfilename = os.path.basename(res.url)
        if re.search('&', extfilename):
            extfilename = "default.crx"

        if res.status_code == 304:
            # Not modified according to If-Modified-Since; double-check via
            # the ETag before deciding to skip the download.
            with request_manager.normal_request():
                etag = requests.head(
                    const_download_url().format(ext_id),
                    timeout=10,
                    allow_redirects=True).headers.get('ETag')
            write_text(tmptardir, date, extfilename + ".etag", etag)
            log_info("- checking etag, last: {}".format(last_crx_etag), 3)
            log_info("            current: {}".format(etag), 3)
            if etag != "" and etag != last_crx_etag:
                log_info("- downloading due to different etags", 3)
                with request_manager.normal_request():
                    res = requests.get(
                        const_download_url().format(ext_id),
                        stream=True,
                        timeout=10)
            else:
                # Unchanged: record a relative link to the previously
                # downloaded crx instead of storing it again.
                write_text(tmptardir, date, extfilename + ".link",
                           os.path.join("..",
                                        last_modified_utc_date(last_crx_file),
                                        extfilename) + "\n")

        store_request_metadata(tmptardir, date, extfilename, res)

        if res.status_code == 200:
            validate_crx_response(res, ext_id, extfilename)
            with open(os.path.join(tmptardir, date, extfilename), 'wb') as f:
                for chunk in res.iter_content(chunk_size=512 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            write_text(tmptardir, date, extfilename + ".etag",
                       res.headers.get("ETag"))
            etag_file = os.path.join(archivedir,
                                     get_local_archive_dir(ext_id),
                                     ext_id + ".etag")
            with open(etag_file, 'w') as f:
                json.dump({
                    "last_crx": os.path.join(ext_id, date, extfilename),
                    "last_crx_etag": res.headers.get("ETag")
                }, f)
    except Exception as e:
        log_exception("Exception when updating crx", 3)
        write_text(tmptardir, date, extfilename + ".exception",
                   traceback.format_exc())
        return RequestResult(res, e)
    return RequestResult(res)
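
# Sketch of the conditional-download pattern used above, reduced to plain
# requests calls (the url argument is a placeholder; the real crawler goes
# through request_manager and const_download_url):
def _demo_conditional_get(url, last_http_date, last_etag):
    res = requests.get(url, headers={'If-Modified-Since': last_http_date},
                       stream=True, timeout=10)
    if res.status_code == 304:
        # Server says unchanged; confirm with the ETag as a second signal.
        etag = requests.head(url, timeout=10,
                             allow_redirects=True).headers.get('ETag')
        if etag == last_etag:
            return None  # genuinely unchanged, skip the download
        res = requests.get(url, stream=True, timeout=10)
    return res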