Example #1
def db_fiddle():
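	# One-off maintenance helper: flips every already-seen Wattpad story page
	# back to the 'new' state so it gets re-fetched, updating in batches of 1000 ids.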
	print("Fixing DB things.")
	print("Getting IDs")
	have = db.get_session().execute("""
		SELECT id FROM web_pages WHERE url LIKE 'https://www.wattpad.com/story%' AND state != 'new';
		""")
	print("Query executed. Fetching results")
	have = list(have)
	print(len(have))
	count = 0

	chunk = []
	for item, in have:
		chunk.append(item)

		count += 1
		if count % 1000 == 0:


			statement = db.get_session().query(db.WebPages) \
				.filter(db.WebPages.state != 'new')        \
				.filter(db.WebPages.id.in_(chunk))

			# statement = db.get_session().update(db.WebPages)
			statement.update({db.WebPages.state : 'new'}, synchronize_session=False)
			chunk = []
			print(count, item)
			db.get_session().commit()

	# Flush whatever is left over after the last full batch of 1000.
	if chunk:
		db.get_session().query(db.WebPages) \
			.filter(db.WebPages.state != 'new')        \
			.filter(db.WebPages.id.in_(chunk))         \
			.update({db.WebPages.state : 'new'}, synchronize_session=False)
		db.get_session().commit()
Example #2
	def do_link_batch_update(self):
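		# Bulk-insert the queued link rows with a raw INSERT ... ON CONFLICT DO NOTHING,
		# retrying the whole batch if the transaction hits a transient error.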
		if not self.batched_links:
			return

		self.log.info("Inserting %s items into DB in batch.", len(self.batched_links))
		while 1:
			try:

				cmd = text("""
						INSERT INTO
							web_pages
							(url, starturl, netloc, distance, is_text, priority, type, fetchtime, state)
						VALUES
							(:url, :starturl, :netloc, :distance, :is_text, :priority, :type, :fetchtime, :state)
						ON CONFLICT DO NOTHING
						""")
				for paramset in self.batched_links:
					db.get_session().execute(cmd, params=paramset)
				db.get_session().commit()
				self.batched_links = []
				break
			except KeyboardInterrupt:
				self.log.info("Keyboard Interrupt?")
				db.get_session().rollback()
			except sqlalchemy.exc.InternalError:
				self.log.info("Transaction error. Retrying.")
				traceback.print_exc()
				db.get_session().rollback()
			except sqlalchemy.exc.OperationalError:
				self.log.info("Transaction error. Retrying.")
				traceback.print_exc()
				db.get_session().rollback()
Example #3
def clear_blocked():
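	# Delete pages whose URL contains any of a ruleset's badwords,
	# restricted to that ruleset's netlocs.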
	for ruleset in WebMirror.rules.load_rules():
		if ruleset['netlocs'] and ruleset['badwords']:
			# mask = [db.WebPages.url.like("%{}%".format(tmp)) for tmp in ruleset['badwords'] if not "%" in tmp]

			for badword in ruleset['badwords']:
				feed_items = db.get_session().query(db.WebPages)          \
					.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
					.filter(db.WebPages.url.like("%{}%".format(badword))) \
					.count()

				if feed_items:
					print("Have:  ", feed_items, badword)
					feed_items = db.get_session().query(db.WebPages)          \
						.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
						.filter(db.WebPages.url.like("%{}%".format(badword))) \
						.delete(synchronize_session=False)
					# Persist the deletes before expiring the session's cached objects.
					db.get_session().commit()
					db.get_session().expire_all()

				else:
					print("Empty: ", feed_items, badword)
			# print(mask)
			# print(ruleset['netlocs'])
			# print(ruleset['badwords'])
	pass
Example #4
def clear_bad():
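	# Like clear_blocked(): delete pages matching each ruleset's non-wildcard
	# badwords within that ruleset's netlocs.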
	from sqlalchemy.dialects import postgresql

	rules = WebMirror.rules.load_rules()

	for ruleset in rules:

		# print(ruleset['netlocs'])
		# print(ruleset.keys())
		for badword in ruleset['badwords']:
			if not ruleset['netlocs']:
				continue
			if "%" in badword:
				print(badword)
			else:

				q = db.get_session().query(db.WebPages)                   \
					.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
					.filter(db.WebPages.url.like("%{}%".format(badword)))
				items = q.count()
				if items:
					print("%s results for : '%s'" % (items, badword))

					q = db.get_session().query(db.WebPages)                   \
						.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
						.filter(db.WebPages.url.like("%{}%".format(badword))) \
						.delete(synchronize_session=False)
					db.get_session().commit()
Example #5
def resetInProgress():
	print("Resetting any stalled downloads from the previous session.")

	# db.get_session().begin()
	db.get_session().query(db.WebPages) \
		.filter((db.WebPages.state == "fetching") | (db.WebPages.state == "processing"))   \
		.update({db.WebPages.state : "new"})
	db.get_session().commit()
Example #6
def update_feed_names():
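	# Rewrite FeedItems.srcname values according to the feedNameLut.mapper lookup table.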
	for key, value in feedNameLut.mapper.items():
		feed_items = db.get_session().query(db.FeedItems) \
				.filter(db.FeedItems.srcname == key)    \
				.all()
		if feed_items:
			for item in feed_items:
				item.srcname = value
			print(len(feed_items))
			print(key, value)
			db.get_session().commit()
Example #7
	def doCall(self):
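		# Guard against re-entrant execution via the PluginStatus.is_running flag,
		# then run the plugin and record run/error timestamps.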

		session = db.get_session()
		item = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name==self.job_name).one()
		if item.is_running:
			session.commit()
			self.log.error("Plugin %s is already running! Not doing re-entrant call!", self.job_name)
			return

		item.is_running = True
		item.last_run = datetime.datetime.now()
		session.commit()

		try:
			self._doCall()
		except Exception:
			item.last_error      = datetime.datetime.now()
			item.last_error_msg  = traceback.format_exc()
			raise
		finally:
			item.is_running = False
			item.last_run_end = datetime.datetime.now()
			session.commit()

			db.delete_session()
Example #8
def delete_comment_feed_items():
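	# Remove feed entries that point at comment feeds or unwanted sites,
	# detaching their author/tag collections before deleting, committing every 1000 rows.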

	sess = db.get_session()
	bad_items = sess.query(db.FeedItems) \
			.filter(or_(
				db.FeedItems.contenturl.like("%#comment-%"),
				db.FeedItems.contenturl.like("%CommentsForInMyDaydreams%"),
				db.FeedItems.contenturl.like("%www.fanfiction.net%"),
				db.FeedItems.contenturl.like("%www.fictionpress.com%"),
				db.FeedItems.contenturl.like("%www.booksie.com%")))    \
			.order_by(db.FeedItems.contenturl) \
			.all()

	count = 0
	for bad in bad_items:
		print(bad.contenturl)

		while bad.author:
			bad.author.pop()
		while bad.tags:
			bad.tags.pop()
		sess.delete(bad)
		count += 1
		if count % 1000 == 0:
			print("Committing at %s" % count)
			sess.commit()

	print("Done. Committing...")
	sess.commit()
Example #9
def longest_rows():
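	# Dump the 50 largest rows (by content length) to ./large_files/ for inspection.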
	print("Getting longest rows from database")
	have = db.get_session().execute("""
		SELECT
			id, url, length(content), content
		FROM
			web_pages
		ORDER BY
			LENGTH(content) DESC NULLS LAST
		LIMIT 50;
		""")
	print("Rows:")

	import os
	import os.path

	savepath = "./large_files/"
	for row in have:
		print(row[0], row[1])
		try:
			os.makedirs(savepath)
		except FileExistsError:
			pass
		with open(os.path.join(savepath, "file %s.txt" % row[0]), "wb") as fp:
			urlst = "URL: %s\n\n" % row[1]
			size = "Length: %s\n\n" % row[2]
			fp.write(urlst.encode("utf-8"))
			fp.write(size.encode("utf-8"))
			fp.write("{}".format(row[3]).encode("utf-8"))
Example #10
def fix_null():
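	# Backfill NULL ignoreuntiltime values to the 'epoch' timestamp,
	# walking the id range in fixed-size windows to keep each UPDATE cheap.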
	step = 50000


	end = db.get_session().execute("""SELECT MAX(id) FROM web_pages WHERE  ignoreuntiltime IS NULL;""")
	end = list(end)[0][0]

	start = db.get_session().execute("""SELECT MIN(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
	start = list(start)[0][0]

	changed = 0

	if not start:
		print("No null rows to fix!")
		return

	start = start - (start % step)

	# Go one window past MAX(id) so the final partial window is not skipped.
	for x in range(start, end + step + 1, step):
		# SQL String munging! I'm a bad person!
		# Only done because I can't easily find how to make sqlalchemy
		# bind parameters ignore the postgres specific cast
		# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
		have = db.get_session().execute("""UPDATE web_pages SET ignoreuntiltime = 'epoch'::timestamp WHERE ignoreuntiltime IS NULL AND id < %s AND id >= %s;""" % (x, x-step))
		# print()
		print('%10i, %7.4f, %6i' % (x, x/end * 100, have.rowcount))
		changed += have.rowcount
		if changed > 10000:
			print("Committing (%s changed rows)...." % changed, end=' ')
			db.get_session().commit()
			print("done")
			changed = 0
	db.get_session().commit()
Example #11
def getDistinctNetlocs():
	query = database.get_session().query(database.WebPages)                 \
		.filter(database.WebPages.is_text == True)                    \
		.filter(database.WebPages.file == None)                       \
		.filter(database.WebPages.state == 'complete')                \
		.distinct(database.WebPages.netloc)                           \
		.with_entities(database.WebPages.id, database.WebPages.netloc)
	# print(query)
	vals = query.all()
	return vals
Example #12
def handleRateLimiting(params, job, engine):
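	# Enforce a per-netloc concurrency cap: if too many fetchers are already active for
	# this domain, push the job's ignoreuntiltime a few minutes into the future instead.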
	allowable = params[0]
	with FETCH_LOCK:
		if not job.netloc in ACTIVE_FETCHES:
			ACTIVE_FETCHES[job.netloc] = 0

	log.info("Active fetchers for domain %s - %s", job.netloc, ACTIVE_FETCHES[job.netloc])
	if ACTIVE_FETCHES[job.netloc] > allowable:
		log.info("Too many instances of fetchers for domain %s active. Forcing requests to sleep for a while", job.netloc)
		job.ignoreuntiltime = datetime.datetime.now() + datetime.timedelta(seconds=60*5 + random.randrange(0, 60*5))
		db.get_session().commit()
		return
	else:
		with FETCH_LOCK:
			ACTIVE_FETCHES[job.netloc] += 1
		try:
			engine.do_job(job)
			time.sleep(5)
		finally:
			# Always release the per-netloc slot, even if do_job() raises.
			with FETCH_LOCK:
				ACTIVE_FETCHES[job.netloc] -= 1
Example #13
def status_view():

	session = db.get_session()
	# session.expire()
	tasks = get_scheduled_tasks(session)
	states = session.query(db.PluginStatus).all()
	session.commit()
	return render_template('status.html',
						   tasks          = tasks,
						   states         = states,
						   )
Example #14
def initializeStartUrls(rules):
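	# Insert a WebPages row for any configured start URL that is not in the database yet.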
	print("Initializing all start URLs in the database")

	for ruleset in [rset for rset in rules if rset['starturls']]:
		for starturl in ruleset['starturls']:
			have = db.get_session().query(db.WebPages) \
				.filter(db.WebPages.url == starturl)   \
				.count()
			if not have:
				netloc = urlFuncs.getNetLoc(starturl)
				new = db.WebPages(
						url               = starturl,
						starturl          = starturl,
						netloc            = netloc,
						type              = ruleset['type'],
						priority          = db.DB_IDLE_PRIORITY,
						distance          = db.DB_DEFAULT_DIST,
						normal_fetch_mode = ruleset['normal_fetch_mode'],
					)
				print("Missing start-url for address: '{}'".format(starturl))
				db.get_session().add(new)
		db.get_session().commit()
Example #15
def fix_tsv():
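	# Backfill the tsv_content full-text-search column by running to_tsvector over
	# content, walking the id range in fixed-size windows.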
	step = 1000


	print("Determining extents that need to be changed.")
	start = db.get_session().execute("""SELECT MIN(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL AND id != 60903982;""")
	start = list(start)[0][0]

	end = db.get_session().execute("""SELECT MAX(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL;""")
	end = list(end)[0][0]

	changed = 0
	print("Start: ", start)
	print("End: ", end)


	if not start:
		print("No null rows to fix!")
		return

	start = start - (start % step)

	# Go one window past MAX(id) so the final partial window is not skipped.
	for x in range(start, end + step + 1, step):
		try:
			# SQL String munging! I'm a bad person!
			# Only done because I can't easily find how to make sqlalchemy
			# bind parameters ignore the postgres specific cast
			# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
			have = db.get_session().execute("""UPDATE web_pages SET tsv_content = to_tsvector(coalesce(content)) WHERE tsv_content IS NULL AND content IS NOT NULL AND id < %s AND id >= %s;""" % (x, x-step))
			# print()
			print('%10i, %10i, %7.4f, %6i' % (x, end, (x-start)/(end-start) * 100, have.rowcount))
			changed += have.rowcount
			if changed > step:
				print("Committing (%s changed rows)...." % changed, end=' ')
				db.get_session().commit()
				print("done")
				changed = 0
		except sqlalchemy.exc.OperationalError:
			db.get_session().rollback()
			print("Error!")
			traceback.print_exc()

	db.get_session().commit()
Example #16
def renderFeedEntry(postid):



	post = db.get_session().query(db.FeedItems) \
		.filter(db.FeedItems.id == postid)    \
		.scalar()

	# relink the feed contents.
	contents = WebMirror.API.replace_links(post.contents)

	return render_template('rss-pages/post.html',
						   entry = post,
						   contents = contents
						   )
Example #17
def renderFeedsSourceTable(source, page=1):
	feeds = db.get_session().query(db.FeedItems) \
		.filter(db.FeedItems.srcname == source)  \
		.order_by(desc(db.FeedItems.published))

	if feeds is None:
		flash(gettext('No feeds? Something is /probably/ broken!.'))
		return redirect(url_for('renderFeedsTable'))

	feed_entries = paginate(feeds, page, app.config['FEED_ITEMS_PER_PAGE'])

	return render_template('rss-pages/feeds.html',
						   subheader = "Source = '%s'" % source,
						   sequence_item   = feed_entries,
						   page            = page
						   )
Example #18
def fetch_content(query_text, column, page, sources=None):
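	# Full-text search over either the title or the pre-computed tsv_content column,
	# ranked with ts_rank_cd and paginated; rolls back and re-raises on query errors.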
	session = db.get_session()
	tsq = build_tsquery(query_text)

	if column == db.WebPages.title:
		query = session                                                                                         \
				.query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \
				.filter(                                                                                        \
					func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english')              \
					)                                                                                           \
				.order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc())

	elif column == db.WebPages.tsv_content:
		query = session                                                                                         \
				.query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq)))                              \
				.filter( column.match(tsq) )                                                                    \
				.order_by(func.ts_rank_cd(column, func.to_tsquery(tsq)).desc())

		if sources:
			query = query.filter(db.WebPages.netloc.in_(sources))

	else:
		raise ValueError("Wat?")

	# print(str(query.statement.compile(dialect=postgresql.dialect())))
	# print("param: '%s'" % tsq)

	try:
		entries = paginate(query, page, per_page=50)

	except sqlalchemy.exc.ProgrammingError:
		traceback.print_exc()
		print("ProgrammingError - Rolling back!")
		db.get_session().rollback()
		raise
	except sqlalchemy.exc.InternalError:
		traceback.print_exc()
		print("InternalError - Rolling back!")
		db.get_session().rollback()
		raise
	except sqlalchemy.exc.OperationalError:
		traceback.print_exc()
		print("InternalError - Rolling back!")
		db.get_session().rollback()
		raise

	return entries
Example #19
def getByNetlocPrefix(netloc, prefix):
	# print("Netloc prefixes")


	query = database.get_session().query(database.WebPages)                 \
		.filter(database.WebPages.is_text == True)                    \
		.filter(database.WebPages.title.like("{}%".format(prefix)))   \
		.filter(database.WebPages.netloc == netloc)                   \
		.order_by(database.WebPages.title)                            \
		.with_entities(database.WebPages.id, database.WebPages.url, database.WebPages.title)


		# .filter(database.WebPages.file == None)                       \
		# .filter(database.WebPages.state == 'complete')                \
		# .group_by(substr_chunk)                                       \

	vals = query.all()
	return vals
Example #20
	def clean_files(self):
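		# Sanity-check that every WebFiles row with a filesystem path still has its
		# file on disk, logging the ones that are missing.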

		session = db.get_session()
		q = session.query(db.WebFiles) \
			.filter(db.WebFiles.fspath != None)

		self.log.info("Querying for non-null filepaths...")
		have = q.all()
		self.log.info("Have %s local files.", len(have))
		count = 0
		for file in have:
			fpath = os.path.join(settings.RESOURCE_DIR, file.fspath)
			if not os.path.exists(fpath):
				self.log.error("Missing file: %s", fpath)

			count += 1
			if count % 1000 == 0:
				self.log.info("Scanned %s files.", count)
Example #21
def renderFeedsTagTable(tag, page=1):
	query = db.get_session().query(db.FeedItems)
	# query = query.join(db.Tags)
	query = query.filter(db.FeedItems.tags.contains(tag))
	query = query.order_by(desc(db.FeedItems.published))

	feeds = query

	if feeds is None:
		flash(gettext('No feeds? Something is /probably/ broken!.'))
		return redirect(url_for('renderFeedsTable'))

	feed_entries = paginate(feeds, page, app.config['FEED_ITEMS_PER_PAGE'])

	return render_template('rss-pages/feeds.html',
						   subheader = "Tag = '%s'" % tag,
						   sequence_item   = feed_entries,
						   page            = page
						   )
Example #22
def scheduleJobs(sched, timeToStart):
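	# Register one interval job per scraper plugin (start times staggered by a minute),
	# remove scheduled jobs that no longer have a plugin, and clear stale is_running flags.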

	jobs = []
	offset = 0
	for key, value in activePlugins.scrapePlugins.items():
		baseModule, interval = value
		jobs.append((key, baseModule, interval, timeToStart+datetime.timedelta(seconds=60*offset)))
		offset += 1

	activeJobs = []

	print("JobCaller: ", JobCaller)
	print("JobCaller.callMod: ", JobCaller.callMod)

	for jobId, callee, interval, startWhen in jobs:
		jId = callee.__name__
		print("JobID = ", jId)
		activeJobs.append(jId)
		if not sched.get_job(jId):
			sched.add_job(do_call,
						args               = (callee.__name__, ),
						trigger            = 'interval',
						seconds            = interval,
						start_date         = startWhen,
						id                 = jId,
						max_instances      =  1,
						replace_existing   = True,
						jobstore           = "main_jobstore",
						misfire_grace_time = 2**30)


	for job in sched.get_jobs('main_jobstore'):
		if not job.id in activeJobs:
			sched.remove_job(job.id, 'main_jobstore')

	print("JobSetup call resetting run-states!")
	session = db.get_session()
	session.query(db.PluginStatus).update({db.PluginStatus.is_running : False})
	session.commit()
	db.delete_session()

	print("Run-states reset.")
Example #23
def getNetlocPrefixes(netloc, length=2):
	# print("Netloc prefixes")
	# SELECT DISTINCT(substring(title for {len})) FROM book_items WHERE lower(title) LIKE %s AND src=%s;

	# So... the .distinct() operator on a query apparently cannot take a function
	# as a parameter, because stupid or something. As such, we hack the query we
	# want together as a return entity
	substr_chunk  = func.substring(func.upper(database.WebPages.title), 0, length)
	# distinct_hack = func.distinct(substr_chunk)

	query = database.get_session().query(database.WebPages)                 \
		.filter(database.WebPages.is_text == True)                    \
		.filter(database.WebPages.file == None)                       \
		.filter(database.WebPages.state == 'complete')                \
		.filter(database.WebPages.netloc == netloc)                   \
		.group_by(substr_chunk)                                       \
		.order_by(substr_chunk)                                       \
		.with_entities(substr_chunk, func.min(database.WebPages.id), func.min(database.WebPages.netloc))

	vals = query.all()
	return vals
Example #24
	def retrigger_pages(self, releases):
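		# Reset already-fetched release pages back to 'new' so they are re-crawled,
		# skipping pages that are unknown, already queued, in-flight, or removed.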
		self.log.info("Total releases found on page: %s. Forcing retrigger of item pages.", len(releases))
		session = db.get_session()
		for release_url in releases:
			while 1:
				try:
					have = session.query(db.WebPages) \
						.filter(db.WebPages.url == release_url)   \
						.scalar()

					# If we don't have the page, ignore
					# it as the normal new-link upsert mechanism
					# will add it.
					if not have:
						self.log.info("New: '%s'", release_url)
						break

					# Also, don't reset if it's in-progress
					if have.state in ['new', 'fetching', 'processing', 'removed']:
						self.log.info("Skipping: '%s' (%s)", release_url, have.state)
						break

					self.log.info("Retriggering page '%s'", release_url)
					have.state = 'new'
					session.commit()
					break


				except sqlalchemy.exc.InvalidRequestError:
					print("InvalidRequest error!")
					session.rollback()
					traceback.print_exc()
				except sqlalchemy.exc.OperationalError:
					print("InvalidRequest error!")
					session.rollback()
				except sqlalchemy.exc.IntegrityError:
					print("[upsertRssItems] -> Integrity error!")
					traceback.print_exc()
					session.rollback()
Example #25
def renderFeedsTable(page=1):

	feeds = db.get_session().query(db.FeedItems)       \
		.order_by(desc(db.FeedItems.published))


	# feeds = feeds.options(joinedload('tags'))
	# feeds = feeds.options(joinedload('authors'))



	if feeds is None:
		flash(gettext('No feeds? Something is /probably/ broken!.'))
		return redirect(url_for('renderFeedsTable'))

	feed_entries = paginate(feeds, page, app.config['FEED_ITEMS_PER_PAGE'])

	return render_template('rss-pages/feeds.html',
						   subheader = "",
						   sequence_item   = feed_entries,
						   page            = page
						   )
Example #26
	def __init__(self, job_name):

		if not job_name in CALLABLE_LUT:
			raise JobNameException("Callable '%s' is not in the class lookup table: '%s'!" % (job_name, CALLABLE_LUT))
		self.runModule = CALLABLE_LUT[job_name]
		self.job_name = job_name


		session = db.get_session()

		try:
			query = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name==job_name)
			have = query.scalar()
			if not have:
				new = db.PluginStatus(plugin_name=job_name)
				session.add(new)
				session.commit()
		except sqlalchemy.exc.OperationalError:
			session.rollback()
		except sqlalchemy.exc.InvalidRequestError:
			session.rollback()

		finally:
			db.delete_session()
Example #27
def rss_db_sync(target = None, recent=False):
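	# Replay stored FeedItems through the RSS processor (optionally limited to one
	# source, or to the last 32 days) to exercise and debug the feed filters.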

	json_file = 'rss_filter_misses-1.json'

	write_debug = True
	if target:
		config.C_DO_RABBIT = False
		flags.RSS_DEBUG    = True
		write_debug = False
	else:
		try:
			os.unlink(json_file)
		except FileNotFoundError:
			pass

	import WebMirror.processor.RssProcessor
	parser = WebMirror.processor.RssProcessor.RssProcessor(loggerPath   = "Main.RssDb",
															pageUrl     = 'http://www.example.org',
															pgContent   = '',
															type        = 'application/atom+xml',
															transfer    = False,
															debug_print = True,
															write_debug = write_debug)


	print("Getting feed items....")

	if target:
		print("Limiting to '%s' source." % target)
		feed_items = db.get_session().query(db.FeedItems) \
				.filter(db.FeedItems.srcname == target)    \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)           \
				.all()
	elif recent:
		cutoff = datetime.datetime.now() - datetime.timedelta(days=32)
		feed_items = db.get_session().query(db.FeedItems) \
				.filter(db.FeedItems.published > cutoff)  \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)             \
				.all()

	else:
		feed_items = db.get_session().query(db.FeedItems) \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)           \
				.all()


	print("Feed items: ", len(feed_items))

	for item in feed_items:
		ctnt = {}
		ctnt['srcname']   = item.srcname
		ctnt['title']     = item.title
		ctnt['tags']      = item.tags
		ctnt['linkUrl']   = item.contenturl
		ctnt['guid']      = item.contentid
		ctnt['published'] = calendar.timegm(item.published.timetuple())

		# Pop()ed off in processFeedData().
		ctnt['contents']  = 'wat'

		try:
			parser.processFeedData(ctnt, tx_raw=False)
		except ValueError:
			pass
		# print(ctnt)
	if target == None:
		sort_json(json_file)