Example #1
def exposed_delete_spcnet_invalid_url_pages():
	'''
	So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
	Anyway, delete those.
	'''
	sess = db.get_db_session()
	tables = [
		db.WebPages.__table__,
		version_table(db.WebPages)
	]

	for ctbl in tables:
		# Query for the affected rows
		q = sess.query(ctbl.c.id) \
			.filter(ctbl.c.netloc == "www.spcnet.tv") \
			.filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
		print("Query:")
		print(q)
		ids = q.all()

		ids = set(ids)

		# Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
		ids = [tmp[0] for tmp in ids]

		print("Fount %s rows requring deletion. Deleting." % len(ids))
		delete_internal(sess, ids)
		sess.commit()
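
All of these examples lean on `version_table` from SQLAlchemy-Continuum, which returns the auto-generated history table for a versioned model. A minimal, self-contained sketch of that setup (the `WebPages` model here is a stand-in for the project's `db.WebPages`):

import sqlalchemy as sa
from sqlalchemy.orm import declarative_base
from sqlalchemy_continuum import make_versioned, version_table

make_versioned(user_cls=None)   # must run before the models are defined

Base = declarative_base()

class WebPages(Base):           # stand-in for db.WebPages
	__tablename__ = 'web_pages'
	__versioned__ = {}          # opt the model into Continuum versioning

	id      = sa.Column(sa.Integer, primary_key=True)
	url     = sa.Column(sa.Text)
	netloc  = sa.Column(sa.Text)
	content = sa.Column(sa.Text)

sa.orm.configure_mappers()      # Continuum builds the version tables here

# The history table carries the original columns plus Continuum's
# transaction_id / end_transaction_id bookkeeping columns.
ctbl = version_table(WebPages.__table__)
print(ctbl.c.keys())
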
Example #2
	def checkHaveHistory(self, url):
		ctbl = version_table(db.WebPages)

		count = self.db_sess.query(ctbl) \
			.filter(ctbl.c.url == url)   \
			.count()
		return count
	def relink_row_sequence(self, rows):
		'''
		Each SQLAlchemy-Continuum transaction references the next transaction in the chain as its `end_transaction_id`,
		except the most recent (where end_transaction_id is `None`).

		Therefore, we iterate over the history in reverse, and for each item set its `end_transaction_id` to the
		id of the next transaction, so the history linked list works correctly.
		'''

		ctbl = version_table(db.WebPages)

		rows.sort(reverse=True, key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
		end_transaction_id = None
		dirty = False
		for x in rows:
			if x.end_transaction_id != end_transaction_id:
				self.log.info("Need to change end_transaction_id from %s to %s", x.end_transaction_id, end_transaction_id)

				update = ctbl.update().where(ctbl.c.id == x.id).where(ctbl.c.transaction_id == x.transaction_id).values(end_transaction_id=end_transaction_id)
				# print(update)
				self.sess.execute(update)

				dirty = True
			end_transaction_id = x.transaction_id

		return dirty
    def relink_row_sequence(self, sess, rows):
        '''
        Each SQLAlchemy-Continuum transaction references the next transaction in the chain as its `end_transaction_id`,
        except the most recent (where end_transaction_id is `None`).

        Therefore, we iterate over the history in reverse, and for each item set its `end_transaction_id` to the
        id of the next transaction, so the history linked list works correctly.
		'''

        ctbl = version_table(db.WebPages)

        rows.sort(reverse=True,
                  key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
        end_transaction_id = None
        dirty = False
        for x in rows:
            if x.end_transaction_id != end_transaction_id:
                self.log.info(
                    "Need to change end_transaction_id from %s to %s",
                    x.end_transaction_id, end_transaction_id)

                update = ctbl.update().where(ctbl.c.id == x.id).where(
                    ctbl.c.transaction_id == x.transaction_id).values(
                        end_transaction_id=end_transaction_id)
                # print(update)
                sess.execute(update)

                dirty = True
            end_transaction_id = x.transaction_id

        return dirty
    def checkHaveHistory(self, url):
        ctbl = version_table(self.db.RawWebPages)

        count = self.db_sess.query(ctbl) \
         .filter(ctbl.c.url == url)   \
         .count()
        return count
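
The relinking logic in `relink_row_sequence` is easier to see on plain data: walking the history newest-to-oldest, the newest row's `end_transaction_id` must be `None`, and every older row must point at the transaction of the next-newer row. A dependency-free sketch of that pass:

rows = [  # (newest first, mirroring the reverse sort above)
    {'transaction_id': 30, 'end_transaction_id': None},  # newest: correct
    {'transaction_id': 20, 'end_transaction_id': 99},    # stale link
    {'transaction_id': 10, 'end_transaction_id': 20},    # already correct
]

end_transaction_id = None
for row in rows:
    if row['end_transaction_id'] != end_transaction_id:
        row['end_transaction_id'] = end_transaction_id  # repair the link
    end_transaction_id = row['transaction_id']

print(rows)  # 30 keeps None, 20 now points at 30, 10 keeps 20
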
Example #6
def render():
    req_url = request.args.get('url')
    if not req_url:
        return build_error_response(message="Error! No page specified!")

    version = request.args.get('version')
    ignore_cache = request.args.get("nocache")

    filterstate = get_filter_state_for_url(req_url)

    try:
        if version == "None":
            version = None
        else:
            version = ast.literal_eval(version)
    except (ValueError, SyntaxError):
        return build_error_response(
            message="Error! Historical version number must be an integer!")

    if version and ignore_cache:
        return build_error_response(
            message="Error! Cannot render a historical version with nocache!")

    if version:
        rid, tid = version

        print("Historical row id: ", rid, tid)

        ctbl = version_table(db.WebPages)

        rows = g.session.query(ctbl.c.title, ctbl.c.content) \
         .filter(ctbl.c.id == rid)                        \
         .filter(ctbl.c.transaction_id == tid)            \
         .all()

        if rows:
            row = rows.pop()
            title, content = row
            content = utilities.replace_links(content)
            cachestate = "Historical version: %s" % (version, )
        else:
            return build_error_response(
                message="Error! Specified historical version not found!")
    else:
        title, content, cachestate = WebMirror.API.getPage(
            req_url, ignore_cache=ignore_cache, version=version)

    # print("Render-Version: ", version, type(version))
    # print("Rendering with nocache=", ignore_cache)
    # print("Return:", cachestate)
    response = jsonify(
        title=title,
        contents=content,
        cachestate=cachestate,
        filterstate=filterstate,
        req_url=req_url,
    )
    return set_cache_control_headers(response)
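
The `version` argument is expected to be the string form of a `(row_id, transaction_id)` tuple, which is why the code parses it with `ast.literal_eval` rather than `int()`, and why malformed input raises. A small sketch of that round trip (the values are made up):

import ast

version_arg = "(4721, 388102)"   # hypothetical ?version=... query value
rid, tid = ast.literal_eval(version_arg)
print(rid, tid)                  # -> 4721 388102

# Anything that isn't a valid literal (or isn't a 2-tuple) raises,
# which is what the except clause above guards against:
for bad in ("4721", "drop table", ""):
    try:
        rid, tid = ast.literal_eval(bad)
    except (ValueError, SyntaxError, TypeError):
        print("rejected:", repr(bad))
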
Example #7
	def checkHaveHistory(self, url):
		# Only do version fiddling if versioning is enabled.
		if not settings.DO_VERSIONING:
			return 99   # arbitrary non-zero count, since versioning is disabled

		ctbl = version_table(db.WebPages)

		count = self.db_sess.query(ctbl) \
			.filter(ctbl.c.url == url)   \
			.count()
		return count
def exposed_delete_gravitytales_bot_blocked_pages():
    '''
	Delete the "checking you're not a bot" garbage pages
	that sometimes get through the gravitytales scraper.
	'''
    sess = db.get_db_session()
    tables = [db.WebPages.__table__, version_table(db.WebPages)]

    for ctbl in tables:
        delete_q = ctbl.delete() \
         .where(ctbl.c.netloc == "gravitytales.com") \
         .where(ctbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
        print(delete_q)
        sess.execute(delete_q)
        sess.commit()
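
Printing a Core statement, as the loop above does, compiles it to parameterised SQL, which is a useful sanity check before running a destructive query. A sketch of the kind of output to expect (table and column definitions assumed):

import sqlalchemy as sa

md = sa.MetaData()
web_pages = sa.Table(
    'web_pages', md,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('netloc', sa.Text),
    sa.Column('content', sa.Text),
)

stmt = web_pages.delete() \
    .where(web_pages.c.netloc == "gravitytales.com") \
    .where(web_pages.c.content.like('%bot-alert%'))

# Printing compiles the statement to parameterised SQL, roughly:
#   DELETE FROM web_pages WHERE web_pages.netloc = :netloc_1
#   AND web_pages.content LIKE :content_1
print(stmt)
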
Example #9
def delete_internal(sess, ids):

	if ids:
		print("Doint delete. %s rows requiring update." % (len(ids), ))
	else:
		print("No rows needing deletion.")
		return

	ctbl = version_table(db.WebPages)
	chunk_size = 5000
	for chunk_idx in range(0, len(ids), chunk_size):
		chunk = ids[chunk_idx:chunk_idx+chunk_size]
		while True:
			try:

				# Allow ids that only exist in the history table by falling back to a
				# history-table query if the main table doesn't have the ID.
				try:
					ex = sess.query(db.WebPages.url).filter(db.WebPages.id == chunk[0]).one()[0]
				except sqlalchemy.orm.exc.NoResultFound:
					ex = sess.query(ctbl.c.url).filter(ctbl.c.id == chunk[0]).all()[0][0]

				print("Example removed URL: '%s'" % (ex))


				q1 = sess.query(db.WebPages).filter(db.WebPages.id.in_(chunk))
				affected_rows_main = q1.delete(synchronize_session=False)

				q2 = sess.query(ctbl).filter(ctbl.c.id.in_(chunk))
				affected_rows_ver = q2.delete(synchronize_session=False)

				sess.commit()
				print("Deleted %s rows (%s version table rows). %0.2f%% done." %
						(affected_rows_main, affected_rows_ver,  100 * ((chunk_idx) / len(ids))))
				break
			except (sqlalchemy.exc.InternalError,
					sqlalchemy.exc.OperationalError,
					sqlalchemy.exc.IntegrityError) as err:
				print("Transaction error (%s). Retrying." % err.__class__.__name__)
				sess.rollback()
			except sqlalchemy.exc.InvalidRequestError:
				print("Transaction error (sqlalchemy.exc.InvalidRequestError). Retrying.")
				traceback.print_exc()
				sess.rollback()
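
`delete_internal` slices the id list into fixed-size chunks so each DELETE's `IN` clause stays bounded. The slicing arithmetic on its own:

ids = list(range(12))
chunk_size = 5
for chunk_idx in range(0, len(ids), chunk_size):
	chunk = ids[chunk_idx:chunk_idx + chunk_size]
	print(chunk_idx, chunk)
# 0 [0, 1, 2, 3, 4]
# 5 [5, 6, 7, 8, 9]
# 10 [10, 11]
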
Example #10
def view_history():
    req_url = request.args.get('url')
    if not req_url:
        return render_template('error.html',
                               title='Viewer',
                               message="Error! No page specified!")

    version = request.args.get('version')
    delete_id = request.args.get("delete_id")
    delete = request.args.get("delete")

    print(version, delete_id, delete)

    if version and not (delete_id or delete):
        return render_template('view.html',
                               title='Rendering Content',
                               req_url=req_url,
                               version=version)

    versions = []

    ctbl = version_table(db.WebPages)


    versions = g.session.query(ctbl.c.id, ctbl.c.state, ctbl.c.fetchtime, ctbl.c.transaction_id) \
     .filter(ctbl.c.url == req_url)                                    \
     .order_by(ctbl.c.fetchtime)                                       \
     .all()

    versions = list(enumerate(versions))
    versions.reverse()

    if delete_id and delete:
        return do_history_delete(versions, version, delete_id, delete)

    return render_template('history.html',
                           title='Item History',
                           req_url=req_url,
                           versions=versions)
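
`view_history` hands the companion `do_history_delete` (shown in Example #12 below) the same `(display_index, row)` pairs it renders, which is why that function can rebuild a dict keyed by the index the user saw. A tiny sketch of the enumerate/reverse/dict round trip:

rows = ['v-a', 'v-b', 'v-c']       # stand-ins for history rows
versions = list(enumerate(rows))   # [(0, 'v-a'), (1, 'v-b'), (2, 'v-c')]
versions.reverse()                 # newest shown first in the template
print(dict(versions)[1])           # -> 'v-b': look up by display index
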
Example #11
def purge_invalid_urls(selected_netloc=None):


	sess = db.get_db_session()
	for ruleset in WebMirror.rules.load_rules():

		if (
				ruleset['netlocs'] and ruleset['badwords']
				and (selected_netloc is None or selected_netloc in ruleset['netlocs'])
			):
			# We have to delete from the normal table before the versioning table,
			# because deleting from the normal table causes inserts into the versioning table
			# due to the change tracking triggers.

			count = 1

			loc = and_(
					db.WebPages.netloc.in_(ruleset['netlocs']),
					or_(*(db.WebPages.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
				)
			# print("Doing count on table ")
			# count = sess.query(db.WebPages) \
			# 	.filter(or_(*opts)) \
			# 	.count()


			if count == 0:
				print("{num} items match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
			else:
				print("{num} items match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))

				# `loc` is already a complete and_() conjunction; pass it to filter() whole.
				sess.query(db.WebPages) \
					.filter(loc) \
					.delete(synchronize_session=False)


			# # Do the delete from the versioning table now.
			ctbl = version_table(db.WebPages)
			loc2 = and_(
					ctbl.c.netloc.in_(ruleset['netlocs']),
					or_(*(ctbl.c.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
				)
			# print("Doing count on Versioning table ")
			# count = sess.query(ctbl) \
			# 	.filter(or_(*opts)) \
			# 	.count()

			if count == 0:
				print("{num} items in versioning table match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
			else:
				print("{num} items in versioning table match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))

				sess.query(ctbl) \
					.filter(loc2) \
					.delete(synchronize_session=False)



			sess.commit()
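
The filter being built here is "netloc in ruleset AND (url LIKE badword-1 OR url LIKE badword-2 ...)". A small sketch of composing that with `and_`/`or_` (the table and ruleset values are assumptions); note the conjunction is passed to `.filter()`/`.where()` whole, since unpacking it with `or_(*loc)` would silently turn the AND into an OR and match far too many rows:

import sqlalchemy as sa
from sqlalchemy import and_, or_

md = sa.MetaData()
pages = sa.Table(
	'web_pages', md,
	sa.Column('netloc', sa.Text),
	sa.Column('url', sa.Text),
)

netlocs  = ['example.com', 'www.example.com']  # hypothetical ruleset values
badwords = ['?replytocom=', '/tag/']

loc = and_(
	pages.c.netloc.in_(netlocs),
	or_(*(pages.c.url.like("%{}%".format(bw)) for bw in badwords))
)

print(sa.select(pages).where(loc))
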
Example #12
def do_history_delete(versions, version, delete_id, delete):

    ctbl = version_table(db.WebPages)

    if delete != "True":
        return render_template('error.html',
                               title='Error when deleting!',
                               message="Delete param not true?")
    try:
        version = int(version)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert version value to integer!")
    try:
        delete_id = int(delete_id)
    except ValueError:
        return render_template(
            'error.html',
            title='Error when deleting!',
            message="Cannot convert delete_id value to integer!")

    versions = dict(versions)

    if delete_id == -1 and version == -1:
        maxid = max(versions.keys())

        for vid, version in versions.items():
            if vid != maxid:
                rid, tid = version.id, version.transaction_id
                print("Deleting:", version, rid, tid)
                g.session.query(ctbl)                     \
                 .filter(ctbl.c.id             == rid) \
                 .filter(ctbl.c.transaction_id == tid) \
                 .delete(synchronize_session=False)
        g.session.commit()
        return render_template('error.html',
                               title='All old versions deleted',
                               message="All old versions deleted")

    else:
        if version not in versions:
            return render_template(
                'error.html',
                title='Error when deleting!',
                message="Version value doesn't exist? ('%s', '%s')" %
                (version, type(version)))
        target = versions[version]
        if target.id != delete_id:
            return render_template(
                'error.html',
                title='Error when deleting!',
                message="Delete row PK Id doesn't match specified delete ID?")

        print("Deleting:", target)
        g.session.query(ctbl)                                       \
         .filter(ctbl.c.id == target.id)                         \
         .filter(ctbl.c.transaction_id == target.transaction_id) \
         .delete(synchronize_session=False)
        g.session.commit()

        return render_template('error.html',
                               title='Row deleted',
                               message="Row: '%s', '%s'" %
                               (delete_id, version))
def exposed_purge_invalid_urls(selected_netloc=None):
	'''
	Iterate over each ruleset in the rules directory, and generate a compound query that will
	delete any matching rows.
	For rulesets with a large number of rows, or many badwords, this
	can be VERY slow.

	Similar in functionality to `clear_bad`, except it results in many fewer queries,
	and is therefore likely much more performant.
	'''

	sess = db.get_db_session()
	for ruleset in WebMirror.rules.load_rules():

		if (
				ruleset['netlocs'] and ruleset['badwords']
				and (selected_netloc is None or selected_netloc in ruleset['netlocs'])
			):
			# We have to delete from the normal table before the versioning table,
			# because deleting from the normal table causes inserts into the versioning table
			# due to the change tracking triggers.

			count = 1
			ands = [
					or_(*(db.WebPages.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
				]

			if selected_netloc:
				ands.append((db.WebPages.netloc == selected_netloc))
			else:
				ands.append(db.WebPages.netloc.in_(ruleset['netlocs']))


			loc = and_(*ands)
			# print("Doing count on table ")
			# count = sess.query(db.WebPages) \
			# 	.filter(or_(*opts)) \
			# 	.count()

			if selected_netloc:
				print(loc)

			if count == 0:
				print("{num} items match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
			else:
				print("{num} items match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))

				# `loc` is already a complete and_() conjunction; pass it to filter() whole.
				sess.query(db.WebPages) \
					.filter(loc) \
					.delete(synchronize_session=False)


			# # Do the delete from the versioning table now.
			ctbl = version_table(db.WebPages)
			loc2 = and_(
					ctbl.c.netloc.in_(ruleset['netlocs']),
					or_(*(ctbl.c.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
				)
			# print("Doing count on Versioning table ")
			# count = sess.query(ctbl) \
			# 	.filter(or_(*opts)) \
			# 	.count()

			if count == 0:
				print("{num} items in versioning table match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
			else:
				print("{num} items in versioning table match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))

				sess.query(ctbl) \
					.filter(loc2) \
					.delete(synchronize_session=False)



			sess.commit()
Example #14
	def truncate_url_history(self, url):
		ctbl = version_table(db.WebPages)

		items = self.sess.query(ctbl) \
			.filter(ctbl.c.url == url) \
			.all()

		items.sort(key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
		# for x in items:
		# 	print(x.id, x.transaction_id, x.end_transaction_id)

		deleted_1 = 0
		deleted_2 = 0

		orig_cnt = len(items)
		datevec = self.snap_times
		self.log.info("Clearing history for URL: %s (items: %s)", url, orig_cnt)
		attachments = {}
		for item in items:
			if item.state != "complete":
				deleted_1 += 1
				self.log.info("Deleting incomplete item for url: %s!", url)
				self.sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
			elif item.content is None and item.file is None:
				self.log.info("Deleting item without a file and no content for url: %s!", url)
				# print(type(item), item.mimetype, item.file, item.content)
				# print(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
				self.sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
				deleted_1 += 1
			elif item.content is not None:
				# print(type(item), item.keys(), item.addtime, item.fetchtime)
				closest = min(datevec, key=self.diff_func(item))
				if closest not in attachments:
					attachments[closest] = []
				attachments[closest].append(item)
			elif item.file is not None:
				# File-backed items are left alone here.
				pass
			else:
				print("Wat?")


		self.log.info("Found %s items missing both file reference and content", deleted_1)
		keys = list(attachments.keys())
		keys.sort()

		out = []

		for key in keys:
			superset = attachments[key]
			if len(superset) > 1:
				# print("lolercoaster")
				superset.sort(key=lambda x: (x.addtime if x.fetchtime is None else x.fetchtime, x.id, x.transaction_id), reverse=True)
				out.append(superset[0])
				# print(superset[0].fetchtime, superset[0].id, superset[0].transaction_id)
				for tmp in superset[1:]:
					self.sess.execute(ctbl.delete().where(ctbl.c.id == tmp.id).where(ctbl.c.transaction_id == tmp.transaction_id))
					deleted_2 += 1
			elif len(superset) == 1:
				out.append(superset[0])
			else:
				raise ValueError("Wat? Key with no items!")

		deleted = deleted_1 + deleted_2
		seq_dirty = self.relink_row_sequence(out)
		if deleted > 0 or seq_dirty:
			# Rewrite the tid links so the history renders properly
			self.log.info("Committing because %s items were removed!", deleted)
			self.sess.commit()
		else:
			self.sess.rollback()

		self.log.info("Deleted: %s items when simplifying history, Total deleted: %s, remaining: %s", deleted_2, deleted, orig_cnt-deleted)
    def truncate_url_history(self, sess, url):
        ctbl = version_table(db.WebPages)

        items = sess.query(ctbl) \
         .filter(ctbl.c.url == url) \
         .all()

        items.sort(
            key=lambda x: (x.id, x.transaction_id, x.end_transaction_id))
        # for x in items:
        # 	print(x.id, x.transaction_id, x.end_transaction_id)

        deleted_1 = 0
        deleted_2 = 0

        orig_cnt = len(items)
        datevec = self.snap_times
        self.log.info("Clearing history for URL: %s (items: %s)", url,
                      orig_cnt)
        attachments = {}
        for item in items:
            if item.state != "complete":
                deleted_1 += 1
                self.log.info("Deleting incomplete item for url: %s!", url)
                sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(
                    ctbl.c.transaction_id == item.transaction_id))
            elif item.content is None and item.file is None:
                self.log.info(
                    "Deleting item without a file and no content for url: %s!",
                    url)
                # print(type(item), item.mimetype, item.file, item.content)
                # print(ctbl.delete().where(ctbl.c.id == item.id).where(ctbl.c.transaction_id == item.transaction_id))
                sess.execute(ctbl.delete().where(ctbl.c.id == item.id).where(
                    ctbl.c.transaction_id == item.transaction_id))
                deleted_1 += 1
            elif item.content is not None:
                # print(type(item), item.keys(), item.addtime, item.fetchtime)
                closest = min(datevec, key=self.diff_func(item))
                if closest not in attachments:
                    attachments[closest] = []
                attachments[closest].append(item)
            elif item.file is not None:
                pass
            else:
                print("Wat?")

        self.log.info("Found %s items missing both file reference and content",
                      deleted_1)
        keys = list(attachments.keys())
        keys.sort()

        out = []

        for key in keys:
            superset = attachments[key]
            if len(superset) > 1:
                # print("lolercoaster")
                superset.sort(
                    key=lambda x: (x.addtime if x.fetchtime is None else x.fetchtime,
                                   x.id, x.transaction_id),
                    reverse=True)
                out.append(superset[0])
                # print(superset[0].fetchtime, superset[0].id, superset[0].transaction_id)
                for tmp in superset[1:]:
                    sess.execute(
                        ctbl.delete().where(ctbl.c.id == tmp.id).where(
                            ctbl.c.transaction_id == tmp.transaction_id))
                    deleted_2 += 1
            elif len(superset) == 1:
                out.append(superset[0])
            else:
                raise ValueError("Wat? Key with no items!")

        deleted = deleted_1 + deleted_2
        seq_dirty = self.relink_row_sequence(sess, out)
        if deleted > 0 or seq_dirty:
            # Rewrite the tid links so the history renders properly
            self.log.info("Committing because %s items were removed!", deleted)
            sess.commit()
        else:
            sess.rollback()

        self.log.info(
            "Deleted: %s items when simplifying history, Total deleted: %s, remaining: %s",
            deleted_2, deleted, orig_cnt - deleted)
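
Snapping each history row to the nearest snapshot time is just a `min()` over the date vector with a distance key (`self.diff_func(item)` presumably builds such a key; that helper isn't shown here). A dependency-free sketch of the idea:

from datetime import datetime

snap_times = [datetime(2018, 1, 1), datetime(2018, 6, 1), datetime(2019, 1, 1)]
fetchtime  = datetime(2018, 5, 20)

closest = min(snap_times, key=lambda snap: abs(snap - fetchtime))
print(closest)  # -> 2018-06-01 00:00:00
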
def exposed_purge_invalid_url_history():
    '''
	Functionally identical to `purge_invalid_urls`, but
	operates on the history table only.

	This means that it will operate on row IDs that have been deleted from the main DB (intentionally or not)
	'''

    with db.session_context() as sess:
        ctbl = version_table(db.WebPages)
        for ruleset in WebMirror.rules.load_rules():

            if ruleset['netlocs'] and ruleset['badwords']:

                agg_bad = list(ruleset['badwords'])
                agg_bad.extend(common.global_constants.GLOBAL_BAD_URLS)

                # Escape LIKE wildcards in the badwords. The escape character
                # (backslash) has to be escaped first, or the escapes added for
                # "%" and "_" would themselves get doubled.
                search_strs = [
                    "%{}%".format(
                        badword.replace("\\", "\\\\")
                               .replace("_", r"\_")
                               .replace("%", r"\%"))
                    for badword in agg_bad
                ]

                print("Badwords:")
                for bad in search_strs:
                    print("	Bad: ", bad)
                print("Netlocs:")
                print(ruleset['netlocs'])

                # We have to delete from the normal table before the versioning table,
                # because deleting from the normal table causes inserts into the versioning table
                # due to the change tracking triggers.

                ands = [or_(*(ctbl.c.url.like(ss) for ss in search_strs))]

                print("Filtering by all netlocs in rule file.")
                ands.append(ctbl.c.netloc.in_(ruleset['netlocs']))


                ids = sess.query(ctbl.c.id) \
                 .filter(and_(*ands))                 \
                 .all()

                # Collapse duplicates
                ids = set(ids)

                if len(ids) == 0:
                    print(
                        "{num} items match badwords from file {file}. No deletion required "
                        .format(file=ruleset['filename'], num=len(ids)))
                else:
                    print(
                        "{num} items match badwords from file {file}. Deleting "
                        .format(file=ruleset['filename'], num=len(ids)))

                # Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
                ids = [tmp[0] for tmp in ids]
                delete_internal(sess, ids, ruleset['netlocs'],
                                ruleset['badwords'])
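
On the LIKE escaping above: `%` and `_` are the wildcard characters, and backslash is the usual escape character, so the backslash has to be escaped first or the escapes just added would themselves get doubled. A hedged helper sketch of the same idea:

def escape_like(fragment):
    # Escape the escape character first, then the LIKE wildcards.
    fragment = fragment.replace("\\", "\\\\")
    fragment = fragment.replace("%", r"\%")
    return fragment.replace("_", r"\_")

print(escape_like("100%_sure\\path"))  # -> 100\%\_sure\\path

SQLAlchemy's `.like()` also accepts an `escape=` argument if a non-default escape character is wanted.
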