def exposed_delete_spcnet_invalid_url_pages():
    '''
    So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
    Anyways, delete those.

    Scans both the live WebPages table and its Continuum history table for the
    characteristic "Invalid Forum specified" error page, and hands the matching
    row IDs to delete_internal() for chunked deletion.
    '''
    with db.session_context() as sess:
        tables = [
            db.WebPages.__table__,
            version_table(db.WebPages.__table__)
        ]

        for ctbl in tables:
            # Query for affected rows in this table.
            q = sess.query(ctbl.c.id) \
                .filter(ctbl.c.netloc == "www.spcnet.tv") \
                .filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
            print("Query:")
            print(q)
            ids = q.all()

            ids = set(ids)

            # Returned list of IDs is each ID packed into a 1-tuple. Unwrap
            # those tuples so it's just a list of integer IDs.
            ids = [tmp[0] for tmp in ids]

            # Typo fixes: "Fount" -> "Found", "requring" -> "requiring".
            print("Found %s rows requiring deletion. Deleting." % len(ids))
            delete_internal(sess, ids)
            sess.commit()
def relink_row_sequence(self, sess, rows):
    '''
    Each Sqlalchemy-Continum transaction references the next transaction in the chain as
    it's `end_transaction_id` except the most recent (where the value of
    end_transaction_id is `None`)

    Therefore, we iterate over the history in reverse, and for each item set it's
    `end_transaction_id` to the id of the next transaction, so the history linked
    list works correctly.

    Returns True if any row was rewritten (caller is expected to commit),
    False if the chain was already consistent.
    '''
    ctbl = version_table(db.WebPages.__table__)

    # Sort newest-first so the walk sees the head of the chain (which must
    # carry end_transaction_id = None) before its older predecessors.
    rows.sort(reverse=True, key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))

    # The newest transaction in the chain points at nothing.
    end_transaction_id = None
    dirty = False
    for x in rows:
        if x.end_transaction_id != end_transaction_id:
            self.log.info("Need to change end_transaction_id from %s to %s", x.end_transaction_id, end_transaction_id)
            # History rows are keyed by the (id, transaction_id) pair.
            update = ctbl.update().where(ctbl.c.id == x.id).where(ctbl.c.transaction_id == x.transaction_id).values(end_transaction_id=end_transaction_id)
            # print(update)
            sess.execute(update)
            dirty = True
        # The next (older) row must point at this row's transaction.
        end_transaction_id = x.transaction_id
    return dirty
def exposed_purge_raw_invalid_urls_from_history():
    '''
    Delete all raw-archiver rows that aren't attached to a archiver module.

    Iterates the raw history table with one session while deleting through a
    second session, committing the delete session every ~5000 bad rows.
    '''
    # Two sessions: sess1 streams rows, sess2 performs the deletes, so the
    # streaming cursor isn't invalidated by commits.
    sess1 = db.get_db_session(postfix='iter_sess')
    sess2 = db.get_db_session(postfix='delete_sess')
    ctbl = version_table(db.RawWebPages.__table__)
    print("Loading files from database...")
    # spinner1 = Spinner()

    # Cheap planner-statistics estimate (avoids a full COUNT(*) scan);
    # only used to size the progress bar.
    est = sess1.execute("SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages_version';")
    res = est.scalar()
    print("Estimated row-count: %s" % res)

    last_bad = ""
    deleted = 0
    total_rows = 0
    last_commit = 0
    maxlen = 0
    changed_rows = 0
    with tqdm.tqdm(total=res) as pbar:
        bad = 0
        for rurl, rnetloc in sess1.query(ctbl.c.url, ctbl.c.netloc).yield_per(1000):
            # A row is unwanted when no raw-archiver module claims its URL,
            # or the URL contains a globally-banned substring.
            modules_wants_url = any([mod.cares_about_url(rurl) for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES])
            has_badwords = any([badword in rurl for badword in common.global_constants.GLOBAL_BAD_URLS])
            if not modules_wants_url or has_badwords:
                last_bad = rnetloc
                # print("Unwanted: ", rurl)
                # Deletes every history row with this URL, so changed_rows
                # can exceed 1 per iteration.
                changed_rows = sess2.query(ctbl) \
                    .filter(ctbl.c.url == rurl) \
                    .delete(synchronize_session=False)
                bad += 1
                deleted += 1
            # NOTE(review): reconstructed placement — total_rows appears to be
            # a per-row counter (the progress message divides deleted by it),
            # not per-deletion; verify against original formatting.
            total_rows += 1
            if bad > 5000:
                # print("Committing!")
                bad = 0
                last_commit = deleted
                sess2.commit()
                # pbar.set_description("Doing Commit", refresh=True)
            else:
                msg = "Deleted: %s, since commit: %s, last_bad: '%s' (%s, %s%%)" % \
                    (deleted, deleted-last_commit, last_bad, changed_rows, 100.0*(deleted / total_rows))
                # Pad to the widest message seen so stale text is overwritten.
                maxlen = max(len(msg), maxlen)
                pbar.set_description(msg.ljust(maxlen), refresh=False)
            pbar.update(n=1)
    sess1.commit()
    sess2.commit()
def checkHaveHistory(self, sess, url):
    '''
    Report whether the RawWebPages history table has any row for `url`.

    Returns the EXISTS query result (boolean), or the sentinel value 99
    when versioning is disabled entirely.
    '''
    if not settings.DO_VERSIONING:
        # Versioning disabled — skip all version fiddling.
        return 99

    hist_tbl = version_table(self.db.RawWebPages.__table__)
    return sess.query(exists().where(hist_tbl.c.url == url)).scalar()
def render():
    '''
    JSON endpoint: render the page named by the `url` request arg.

    Request args:
        url     -- (required) page to render.
        version -- optional "(row_id, transaction_id)" tuple literal selecting
                   a historical row from the version table. Absent or "None"
                   renders the live page.
        nocache -- bypass the page cache. Not valid together with `version`.

    Returns a JSON payload of title/contents/cachestate/filterstate/req_url,
    or an error response for bad parameters.
    '''
    req_url = request.args.get('url')
    if not req_url:
        return build_error_response(message="Error! No page specified!")

    version      = request.args.get('version')
    ignore_cache = request.args.get("nocache")
    filterstate  = get_filter_state_for_url(req_url)

    try:
        # Bugfix: also treat an *absent* version param as None (matches
        # view_rendered()). Previously a missing param fell through into
        # ast.literal_eval(None), which raises ValueError and produced a
        # bogus "must be an integer" error for every plain render request.
        if version == "None" or version is None:
            version = None
        else:
            version = ast.literal_eval(version)
    except ValueError:
        return build_error_response(
            message="Error! Historical version number must be an integer!")

    if version and ignore_cache:
        return build_error_response(
            message="Error! Cannot render a historical version with nocache!")

    if version:
        rid, tid = version
        print("Historical row id: ", rid, tid)
        ctbl = version_table(db.WebPages.__table__)
        rows = g.session.query(ctbl.c.title, ctbl.c.content) \
            .filter(ctbl.c.id == rid) \
            .filter(ctbl.c.transaction_id == tid) \
            .all()
        # NOTE(review): if no history row matches, title/content/cachestate
        # stay unbound and jsonify() below raises NameError — confirm whether
        # a missing-version error response is wanted here.
        if rows:
            row = rows.pop()
            title, content = row
            content = utilities.replace_links(content)
            cachestate = "Historical version: %s" % (version, )
    else:
        title, content, cachestate = WebMirror.API.getPage(
            req_url,
            ignore_cache=ignore_cache,
            version=version)

    # print("Render-Version: ", version, type(version))
    # print("Rendering with nocache=", ignore_cache)
    # print("Return:", cachestate)
    response = jsonify(
        title=title,
        contents=content,
        cachestate=cachestate,
        filterstate=filterstate,
        req_url=req_url,
    )
    return set_cache_control_headers(response)
def checkHaveHistory(self, url):
    '''
    Count the history rows for `url` in the WebPages version table.

    Returns the sentinel value 99 when versioning is disabled.
    '''
    # Only do version fiddling if versioning is enabled.
    if not settings.DO_VERSIONING:
        return 99

    hist_tbl = version_table(db.WebPages.__table__)
    return self.db_sess.query(hist_tbl) \
        .filter(hist_tbl.c.url == url) \
        .count()
def exposed_delete_gravitytales_bot_blocked_pages():
    '''
    Delete the "checking you're not a bot" garbage pages that sometimes
    get through the gravitytales scraper.

    Removes matching rows from both the live WebPages table and its
    Continuum history table, committing after each table.
    '''
    with db.session_context() as sess:
        for tbl in (db.WebPages.__table__, version_table(db.WebPages.__table__)):
            stmt = tbl.delete() \
                .where(tbl.c.netloc == "gravitytales.com") \
                .where(tbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
            print(stmt)
            sess.execute(stmt)
            sess.commit()
def delete_internal(sess, ids):
    '''
    Delete the rows with primary keys in `ids` from both the WebPages table
    and its Continuum history table, in chunks of 5000, committing per chunk.

    Transient transaction errors roll back and retry the current chunk.
    NOTE(review): the retry loop has no attempt limit — a persistent error
    (other than the four caught types) would spin forever; confirm intended.
    '''
    if ids:
        # Typo fix: "Doint" -> "Doing".
        print("Doing delete. %s rows requiring update." % (len(ids), ))
    else:
        print("No rows needing deletion.")
        return

    ctbl = version_table(db.WebPages.__table__)

    chunk_size = 5000
    for chunk_idx in range(0, len(ids), chunk_size):
        chunk = ids[chunk_idx:chunk_idx+chunk_size]
        while 1:
            try:
                # Allow ids that only exist in the history table by falling back to a
                # history-table query if the main table doesn't have the ID.
                try:
                    ex = sess.query(db.WebPages.url).filter(db.WebPages.id == chunk[0]).one()[0]
                except sqlalchemy.orm.exc.NoResultFound:
                    ex = sess.query(ctbl.c.url).filter(ctbl.c.id == chunk[0]).all()[0][0]

                print("Example removed URL: '%s'" % (ex))

                q1 = sess.query(db.WebPages).filter(db.WebPages.id.in_(chunk))
                affected_rows_main = q1.delete(synchronize_session=False)

                q2 = sess.query(ctbl).filter(ctbl.c.id.in_(chunk))
                affected_rows_ver = q2.delete(synchronize_session=False)

                sess.commit()
                print("Deleted %s rows (%s version table rows). %0.2f%% done." %
                    (affected_rows_main, affected_rows_ver, 100 * ((chunk_idx) / len(ids))))
                break
            except sqlalchemy.exc.InternalError:
                print("Transaction error (sqlalchemy.exc.InternalError). Retrying.")
                sess.rollback()
            except sqlalchemy.exc.OperationalError:
                print("Transaction error (sqlalchemy.exc.OperationalError). Retrying.")
                sess.rollback()
            except sqlalchemy.exc.IntegrityError:
                print("Transaction error (sqlalchemy.exc.IntegrityError). Retrying.")
                sess.rollback()
            except sqlalchemy.exc.InvalidRequestError:
                print("Transaction error (sqlalchemy.exc.InvalidRequestError). Retrying.")
                traceback.print_exc()
                sess.rollback()
def view_history():
    '''
    List (or dispatch actions on) the version history of the page named by
    the `url` request arg.

    With only `version` set, renders that historical version. With both
    `delete_id` and `delete` set, forwards to do_history_delete(). Otherwise
    renders the newest-first history listing.
    '''
    req_url = request.args.get('url')
    if not req_url:
        return render_template('error.html',
                               title='Viewer',
                               message="Error! No page specified!")

    version   = request.args.get('version')
    delete_id = request.args.get("delete_id")
    delete    = request.args.get("delete")

    print(version, delete_id, delete)

    # A version with no delete params means "render that historical version".
    if version and not (delete_id or delete):
        return render_template('view.html',
                               title='Rendering Content',
                               req_url=req_url,
                               version=version)

    hist_tbl = version_table(db.WebPages.__table__)
    rows = g.session.query(hist_tbl.c.id,
                           hist_tbl.c.state,
                           hist_tbl.c.addtime,
                           hist_tbl.c.fetchtime,
                           hist_tbl.c.transaction_id) \
        .filter(hist_tbl.c.url == req_url) \
        .order_by(hist_tbl.c.fetchtime) \
        .all()

    # Pair each row with its oldest-first index, then present newest-first.
    versions = list(enumerate(rows))[::-1]

    if delete_id and delete:
        return do_history_delete(versions, version, delete_id, delete)

    return render_template('history.html',
                           title='Item History',
                           req_url=req_url,
                           versions=versions)
def truncate_url_history(self, sess, url):
    '''
    Flatten the version history of `url` down to (at most) one retained row
    per snapshot interval in self.snap_times, deleting incomplete/empty rows
    along the way.

    Returns the number of history rows deleted. Feed URLs are skipped
    entirely (their wholesale-delete path is currently commented out).
    '''
    # last_check = db.get_from_version_check_table(sess, url)
    # if last_check > datetime.datetime.now() - FLATTEN_SCAN_INTERVAL:
    #     self.log.info("Url %s checked within the check interval (%s, %s). Skipping.", url, FLATTEN_SCAN_INTERVAL, last_check)
    #     return 0
    # else:
    #     self.log.info("Url %s last checked %s.", url, last_check)

    ctbl = version_table(db.WebPages.__table__)

    if url in self.feed_urls:
        self.log.info("Feed URL (%s)! Deleting history wholesale!", url)
        # res = sess.execute(
        #         ctbl.delete() \
        #         .where(ctbl.c.url == url)
        #     )
        # self.log.info("Modified %s rows", res.rowcount)
        # sess.commit()
        # self.log.info("Committed. Setting version log.")
        # db.set_in_version_check_table(sess, url, datetime.datetime.now())
        # new_val = db.get_from_version_check_table(sess, url)
        # self.log.info("New value from DB: %s", new_val)
        return

    self.log.info("Counting rows for url %s.", url)
    orig_cnt = sess.query(ctbl) \
        .filter(ctbl.c.url == url) \
        .count()
    self.log.info("Found %s results for url %s. Fetching rows", orig_cnt, url)

    # deleted_1: incomplete or contentless rows; deleted_2: rows flattened
    # out of over-populated date segments.
    deleted_1 = 0
    deleted_2 = 0
    datevec = self.snap_times
    # Maps snapshot-time -> list of candidate-row descriptors for that segment.
    attachments = {}
    for item in tqdm.tqdm(
            sess.query(ctbl) \
                .filter(ctbl.c.url == url) \
                .order_by(ctbl.c.id, ctbl.c.transaction_id) \
                .yield_per(50),
            total=orig_cnt):
        if item.state != "complete" and item.state != 'error':
            # Row never finished fetching — drop it.
            deleted_1 += 1
            self.log.info("Deleting incomplete item for url: %s (state: %s)!", url, item.state)
            sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
        elif item.content is None and item.file is None:
            # Row carries neither inline content nor a file reference.
            self.log.info("Deleting item without a file and no content for url: %s!", url)
            # print(type(item), item.mimetype, item.file, item.content)
            # print(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
            sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
            deleted_1 += 1
        elif item.content != None:
            # Bucket content rows into the nearest snapshot time; only the
            # newest row per bucket survives the second pass below.
            closest = min(datevec, key=self.diff_func(item))
            if not closest in attachments:
                attachments[closest] = []
            attachments[closest].append({
                'addtime'        : item.addtime,
                'fetchtime'      : item.fetchtime,
                'id'             : item.id,
                'transaction_id' : item.transaction_id,
            })
        elif item.file != None:
            # File-backed rows are never flattened.
            pass
        else:
            print("Wat?")

    self.log.info("Found %s items missing both file reference and content", deleted_1)

    keys = list(attachments.keys())
    keys.sort()
    # Rows surviving flattening (currently only used by the commented-out
    # relink_row_sequence call below).
    out = []
    for key in tqdm.tqdm(keys):
        superset = attachments[key]
        if len(superset) > 1:
            # print("lolercoaster")
            # Newest first: prefer fetchtime, falling back to addtime.
            superset.sort(key=lambda x: (x['addtime'] if x['fetchtime'] is None else x['fetchtime'], x['id'], x['transaction_id']), reverse=True)
            out.append(superset[0])
            # print(superset[0].fetchtime, superset[0].id, superset[0].transaction_id)
            self.log.info("Deleting %s items (out of %s) from date-segment %s", len(superset)-1, len(superset), key)
            for tmp in superset[1:]:
                sess.execute(ctbl.delete().where(ctbl.c.id == tmp['id']).where(ctbl.c.transaction_id == tmp['transaction_id']))
                deleted_2 += 1
        elif len(superset) == 1:
            out.append(superset[0])
        else:
            raise ValueError("Wat? Key with no items!")

    deleted = deleted_1 + deleted_2

    # seq_dirty = self.relink_row_sequence(sess, out)
    # if deleted > 0 or seq_dirty:
    if deleted > 0:
        # Rewrite the tid links so the history renders properly
        # self.log.info("Committing because %s items were removed!", deleted)
        sess.commit()
    else:
        sess.rollback()

    db.set_in_version_check_table(sess, url, datetime.datetime.now())
    self.log.info("Deleted: %s items when simplifying history, %s incomplete items, Total deleted: %s, remaining: %s",
        deleted_2, deleted_1, deleted, orig_cnt-deleted)
    return deleted
def checkHaveHistory(self, url):
    '''
    Return whether any history rows exist for `url` in the RawWebPages
    version table. The EXISTS query yields exactly one single-column row.
    '''
    hist_tbl = version_table(self.db.RawWebPages.__table__)
    # Unpack the lone (flag,) row the EXISTS query produces.
    ((have_history, ), ) = self.db_sess.query(exists().where(hist_tbl.c.url == url))
    return have_history
def sync_filtered_with_filesystem():
    '''
    Cross-check the file paths referenced by the database (live WebFiles rows
    plus the raw history table) against the files actually present under
    settings.RESOURCE_DIR, and report on-disk files no DB row references.

    Actual deletion is disabled (os.unlink is commented out); orphan paths
    are only printed.
    '''
    with db.session_context() as sess:
        tgtpath = settings.RESOURCE_DIR
        ctbl = version_table(db.RawWebPages.__table__)
        print("Loading files from database...")
        # spinner1 = Spinner()

        # Cheap planner-statistics row estimates, only used to size the
        # progress bars.
        est = sess.execute(
            "SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages';"
        )
        res = est.scalar()
        vest = sess.execute(
            "SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages_version';"
        )
        vres = vest.scalar()
        print("Estimated row-count: %s, version table: %s" % (res, vres))

        in_main_db = []
        with tqdm(total=res) as pbar:
            for row in sess.query(db.WebFiles).yield_per(10000):
                if row.fspath:
                    in_main_db.append(row.fspath)
                pbar.update(n=1)

        in_history_db = []
        with tqdm(total=vres) as pbar:
            for rfspath, in sess.query(ctbl.c.fspath).yield_per(1000):
                if rfspath:
                    in_history_db.append(rfspath)
                pbar.update(n=1)

        origl_main = len(in_main_db)
        origl_hist = len(in_history_db)
        in_db_main = set(in_main_db)
        in_db_hist = set(in_history_db)
        # Bugfix: sets don't support `+` — the original `in_db_main + in_db_hist`
        # raised TypeError at runtime. Use set union instead.
        in_db = in_db_main | in_db_hist

        print("")
        print("%s files, %s unique" % ((origl_main, origl_hist), (len(in_db_main), len(in_db_hist))))
        print("Enumerating files from disk...")

        agg_files = []
        have_files = []
        # spinner2 = Spinner()
        with tqdm(total=len(in_db)) as pbar:
            for root, _, files in os.walk(tgtpath):
                for filen in files:
                    fqpath = os.path.join(root, filen)
                    # Path relative to the resource dir, as stored in the DB.
                    fpath = fqpath[len(tgtpath) + 1:]
                    if fpath in in_db:
                        pbar.update(n=1)
                        # spinner2.next(star=True, vlen=0)
                        have_files.append(fpath)
                    else:
                        pbar.update(n=1)
                        # spinner2.next(vlen=1)
                        agg_files.append(fpath)
                        fqpath = os.path.join(tgtpath, fpath)
                        # os.unlink(fqpath)
                        print("\rDeleting: %s " % fqpath)
def do_history_delete(versions, version, delete_id, delete):
    '''
    Delete rows from a page's version history.

    `versions` is a list of (index, row) pairs; `version` selects the index,
    `delete_id` must match the selected row's PK, and `delete` must be the
    string "True". The sentinel pair version == -1 and delete_id == -1 purges
    every row except the newest.
    '''
    hist_tbl = version_table(db.WebPages.__table__)

    if delete != "True":
        return render_template('error.html',
                               title='Error when deleting!',
                               message="Delete param not true?")

    try:
        version = int(version)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert version value to integer!")
    try:
        delete_id = int(delete_id)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert delete_id value to integer!")

    versions = dict(versions)

    if delete_id == -1 and version == -1:
        # Purge everything but the newest entry.
        newest = max(versions.keys())
        for idx, row in versions.items():
            if idx == newest:
                continue
            rid, tid = row.id, row.transaction_id
            print("Deleting:", row, rid, tid)
            g.session.query(hist_tbl) \
                .filter(hist_tbl.c.id == rid) \
                .filter(hist_tbl.c.transaction_id == tid) \
                .delete(synchronize_session=False)
        g.session.commit()
        return render_template('error.html',
                               title='All old versions deleted',
                               message="All old versions deleted")

    if version not in versions:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Version value doesn't exist? ('%s', '%s')" % (version, type(version)))

    target = versions[version]
    if target.id != delete_id:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Delete row PK Id doesn't match specified delete ID?")

    print("Deleting:", target)
    g.session.query(hist_tbl) \
        .filter(hist_tbl.c.id == target.id) \
        .filter(hist_tbl.c.transaction_id == target.transaction_id) \
        .delete(synchronize_session=False)
    g.session.commit()
    return render_template('error.html',
                           title='Row deleted',
                           message="Row: '%s', '%s'" % (delete_id, version))
def view_rendered():
    '''
    HTML view: render the page named by the `url` request arg.

    Request args:
        url     -- (required) page to render.
        version -- optional "(row_id, transaction_id)" tuple literal selecting
                   a historical row from the version table. Absent or "None"
                   renders the live page.
        nocache -- bypass the page cache. Not valid together with `version`.
    '''
    req_url = request.args.get('url')
    if not req_url:
        return render_template('error.html',
                               title='Error',
                               message="Error! No page specified!")
    # (Removed a redundant second request.args.get('url') fetch.)

    version      = request.args.get('version')
    ignore_cache = request.args.get("nocache")
    filterstate  = get_filter_state_for_url(req_url)

    try:
        if version == "None" or version is None:
            version = None
        else:
            version = ast.literal_eval(version)
    except ValueError:
        traceback.print_exc()
        return render_template(
            'error.html',
            title='Error',
            message="Error! Historical version number must be an integer!")

    if version and ignore_cache:
        return render_template(
            'error.html',
            title='Error',
            message="Error! Cannot render a historical version with nocache!")

    if version:
        rid, tid = version
        print("Historical row id: ", rid, tid)
        ctbl = version_table(db.WebPages.__table__)
        rows = g.session.query(ctbl.c.title, ctbl.c.content) \
            .filter(ctbl.c.id == rid) \
            .filter(ctbl.c.transaction_id == tid) \
            .all()
        # NOTE(review): if no history row matches, title/content/cachestate
        # remain unbound and the render below raises NameError — confirm
        # whether an explicit error page is wanted here.
        if rows:
            row = rows.pop()
            title, content = row
            content = utilities.replace_links(content)
            cachestate = "Historical version: %s" % (version, )
    else:
        title, content, cachestate = WebMirror.API.getPage(
            req_url,
            ignore_cache=ignore_cache,
            version=version)

    response = make_response(
        render_template(
            'view-rendered.html',
            req_url=req_url,
            version=version,
            title=title,
            contents=content,
            cachestate=cachestate,
            filterstate=filterstate,
        ))
    return set_cache_control_headers(response, allow_inline=True)