def exposed_clear_bad():
	'''
	Iterate over all blocked strings from the various YAML rules,
	deleting any occurrences of each from the database.
	SLOW
	'''
	from sqlalchemy.dialects import postgresql

	rules = WebMirror.rules.load_rules()

	for ruleset in rules:

		print("Cleaning ruleset")
		# print(ruleset['netlocs'])
		# print(ruleset.keys())
		if not ruleset['netlocs']:
			continue
		for badword in ruleset['badwords']:
			if "%" in badword:
				print(badword)
			else:
				print("Deleting items containing string: '%s'" % badword)
				q = db.get_db_session().query(db.WebPages)                   \
					.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
					.filter(db.WebPages.url.like("%{}%".format(badword)))
				items = q.count()
				if items:
					print("%s results for : '%s'" % (items, badword))

					q = db.get_db_session().query(db.WebPages)                   \
						.filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
						.filter(db.WebPages.url.like("%{}%".format(badword))) \
						.delete(synchronize_session=False)
					db.get_db_session().commit()
def exposed_purge_raw_invalid_urls_from_history():
	'''
	Delete all raw-archiver history rows that aren't
	attached to an archiver module.
	'''

	sess1 = db.get_db_session(postfix='iter_sess')
	sess2 = db.get_db_session(postfix='delete_sess')

	ctbl = version_table(db.RawWebPages.__table__)

	print("Loading files from database...")
	# spinner1 = Spinner()

	est = sess1.execute("SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages_version';")
	res = est.scalar()
	print("Estimated row-count: %s" % res)

	last_bad = ""
	deleted = 0
	total_rows = 0
	last_commit = 0
	maxlen = 0
	changed_rows = 0
	with tqdm.tqdm(total=res) as pbar:
		bad = 0

		for rurl, rnetloc in sess1.query(ctbl.c.url, ctbl.c.netloc).yield_per(1000):
			modules_wants_url = any([mod.cares_about_url(rurl) for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES])
			has_badwords      = any([badword in rurl for badword in common.global_constants.GLOBAL_BAD_URLS])
			if not modules_wants_url or has_badwords:
				last_bad = rnetloc
				# print("Unwanted: ", rurl)

				changed_rows = sess2.query(ctbl) \
					.filter(ctbl.c.url == rurl) \
					.delete(synchronize_session=False)

				bad += 1
				deleted += 1
			total_rows += 1

			if bad > 5000:
				# print("Committing!")
				bad = 0
				last_commit = deleted
				sess2.commit()
				# pbar.set_description("Doing Commit", refresh=True)
			else:
				msg = "Deleted: %s, since commit: %s, last_bad: '%s' (%s, %s%%)" % \
					(deleted, deleted-last_commit, last_bad, changed_rows, 100.0*(deleted / total_rows))
				maxlen = max(len(msg), maxlen)
				pbar.set_description(msg.ljust(maxlen), refresh=False)


			pbar.update(n=1)

	sess1.commit()
	sess2.commit()
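# The filter above (and exposed_purge_raw_invalid_urls further down) checks each URL
# against RawArchiver.RawActiveModules.ACTIVE_MODULES, assuming every module exposes a
# cares_about_url() classmethod. A purely illustrative sketch of that interface (the
# netloc is made up; the real modules live in RawArchiver):
import urllib.parse

class ExampleRawModule(object):
	_handled_netlocs = ["www.example.org"]

	@classmethod
	def cares_about_url(cls, url):
		return urllib.parse.urlsplit(url).netloc in cls._handled_netlocs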
def exposed_update_feed_names():
	'''
	Apply any new feednamelut names to existing fetched RSS posts.
	'''
	for key, value in feedNameLut.mapper.items():
		feed_items = db.get_db_session().query(db.FeedItems) \
				.filter(db.FeedItems.srcname == key)    \
				.all()
		if feed_items:
			for item in feed_items:
				item.srcname = value
			print(len(feed_items))
			print(key, value)
			db.get_db_session().commit()
def resetRunStates():
	print("JobSetup call resetting run-states!")
	session = db.get_db_session()
	session.query(db.PluginStatus).update({db.PluginStatus.is_running : False})
	session.commit()
	db.delete_db_session()
	print("Run-states reset.")
	def consolidate_history(self):

		sess = db.get_db_session()
		self.qlog.info("Querying for items with significant history size")
		end = sess.execute("""
				SELECT
					count(*), url
				FROM
					web_pages_version
				GROUP BY
					url
				HAVING
					COUNT(*) > 10
				ORDER BY url
			""")
		end = list(end)
		self.qlog.info("Found %s items with more then 10 history entries. Processing", len(end))

		for count, url in end:
			while 1:
				try:
					self.truncate_url_history(url)
					break
				except sqlalchemy.exc.OperationalError:
					sess.rollback()
def exposed_delete_nu_unresolved():
	'''
	Delete all nu head system rows that have not been reviewed.

	This is needed for historical purges, particularly if
	nu changes their extnu ids.
	'''
	sess = db.get_db_session()

	count = 0

	for row in sess.query(db.NuReleaseItem) \
		.yield_per(50).all():

		if len(list(row.resolved)) != 3 and row.reviewed == 'unverified':

			print(row.id, len(list(row.resolved)), row.referrer)
			for bad in row.resolved:
				sess.delete(bad)
			sess.delete(row)
			count += 1
			# if count % 500 == 0:
			# 	print("Committing!")
			# 	sess.commit()

	print("Committing!")
	sess.commit()
def exposed_delete_spcnet_invalid_url_pages():
	'''
	So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
	Anyways, delete those.
	'''
	sess = db.get_db_session()
	tables = [
		db.WebPages.__table__,
		version_table(db.WebPages)
	]

	for ctbl in tables:
		# Query for the affected rows in this table
		q = sess.query(ctbl.c.id) \
			.filter(ctbl.c.netloc == "www.spcnet.tv") \
			.filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
		print("Query:")
		print(q)
		ids = q.all()

		ids = set(ids)

		# Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
		ids = [tmp[0] for tmp in ids]

		print("Fount %s rows requring deletion. Deleting." % len(ids))
		delete_internal(sess, ids)
		sess.commit()
def exposed_delete_feed(feed_name, do_delete, search_str):
	'''
	feed_name is the human-readable name of the feed, from feedNameLut.py.
	do_delete is a flag that determines whether the deletion is actually performed, or the
		actions are just previewed. Unless do_delete.lower() contains "true", no action
		will actually be taken.
	search_str is the search key. Searches are case sensitive, and are matched against
		the item title and tags.
		search_str is split on the literal character "|", for requiring multiple substrings
		to be present in the searched item.

	Delete the RSS entries for a feed, using a search key.

	'''

	sess = db.get_db_session()
	items = sess.query(db.FeedItems)               \
		.filter(db.FeedItems.srcname == feed_name) \
		.all()

	do_delete = "true" in do_delete.lower()

	searchitems = search_str.split("|")
	for item in items:
		itemall = " ".join([item.title] + item.tags)
		if all([searchstr in itemall for searchstr in searchitems]):
			print(itemall)
			if do_delete:
				print("Deleting item")
				sess.delete(item)

	sess.commit()
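# Hypothetical invocations of exposed_delete_feed(), matching the docstring above. The
# feed name and search terms are made up; the first call only previews matching items,
# and only the second (do_delete="true") actually removes rows.
exposed_delete_feed("Example Translations", do_delete="false", search_str="Chapter|teaser")
exposed_delete_feed("Example Translations", do_delete="true",  search_str="Chapter|teaser")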
	def doCall(self):

		self.log.info("Calling job %s", self.job_name)
		session = db.get_db_session()
		item = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name==self.job_name).one()
		if item.is_running:
			session.commit()
			self.log.error("Plugin %s is already running! Not doing re-entrant call!", self.job_name)
			return

		item.is_running = True
		item.last_run = datetime.datetime.now()
		session.commit()

		try:
			self._doCall()
		except Exception:
			item.last_error      = datetime.datetime.now()
			item.last_error_msg  = traceback.format_exc()
			raise
		finally:

			item2 = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name==self.job_name).one()
			item2.is_running = False
			item2.last_run_end = datetime.datetime.now()
			session.commit()
			db.delete_db_session()
		self.log.info("Job %s complete.", self.job_name)
def exposed_delete_comment_feed_items():
	'''
	Iterate over all retrieved feed article entries, and delete any that look
	like they're comment feed articles.
	'''
	sess = db.get_db_session()
	bad = sess.query(db.FeedItems) \
			.filter(or_(
				db.FeedItems.contenturl.like("%#comment-%"),
				db.FeedItems.contenturl.like("%CommentsForInMyDaydreams%"),
				db.FeedItems.contenturl.like("%www.fanfiction.net%"),
				db.FeedItems.contenturl.like("%www.fictionpress.com%"),
				db.FeedItems.contenturl.like("%www.booksie.com%")))    \
			.order_by(db.FeedItems.contenturl) \
			.all()

	count = 0
	for bad in bad:
		print(bad.contenturl)

		while bad.author:
			bad.author.pop()
		while bad.tags:
			bad.tags.pop()
		sess.delete(bad)
		count += 1
		if count % 1000 == 0:
			print("Committing at %s" % count)
			sess.commit()

	print("Done. Committing...")
	sess.commit()
def qidianSmartFeedFetch(params, rid, joburl, netloc, job_aggregator_instance):
    print('qidianSmartFeedFetch', params, rid, joburl, netloc)

    sess = db.get_db_session(flask_sess_if_possible=False)
    have = sess.query(db.QidianFeedPostMeta).order_by(
        desc(db.QidianFeedPostMeta.id)).limit(500).all()

    meta_dict = {}
    for row in have:
        meta_dict[row.contentid] = row.meta

    sess.commit()

    raw_job = WebMirror.JobUtils.buildjob(
        module='SmartWebRequest',
        call='qidianSmartFeedFetch',
        dispatchKey="fetcher",
        jobid=rid,
        args=[joburl],
        kwargs={'meta': meta_dict},
        additionalData={},
        postDelay=0,
        serialize="QidianModule",
    )

    # print("Raw job:")
    # print(raw_job)
    # return raw_job

    job_aggregator_instance.put_job(raw_job)
def initializeStartUrls(rules):
	print("Initializing all start URLs in the database")
	sess = db.get_db_session()
	for ruleset in [rset for rset in rules if rset['starturls']]:
		for starturl in ruleset['starturls']:
			have = sess.query(db.WebPages) \
				.filter(db.WebPages.url == starturl)   \
				.count()
			if not have:
				netloc = urlFuncs.getNetLoc(starturl)
				new = db.WebPages(
						url               = starturl,
						starturl          = starturl,
						netloc            = netloc,
						type              = ruleset['type'],
						priority          = db.DB_IDLE_PRIORITY,
						distance          = db.DB_DEFAULT_DIST,
						normal_fetch_mode = ruleset['normal_fetch_mode'],
					)
				print("Missing start-url for address: '{}'".format(starturl))
				sess.add(new)
			try:
				sess.commit()
			except sqlalchemy.SQLAlchemyError:
				print("Failure inserting start url for address: '{}'".format(starturl))

				sess.rollback()
	sess.close()
	db.delete_db_session()
def exposed_dump_raw_feed_data():
    '''
	Dump the raw feed data to a json file.
	'''
    import json

    sess = db.get_db_session()
    print("Selecting 1")
    feed_pages = sess.execute("SELECT * FROM feed_pages;")
    print("Selecting 2")
    nu_outbound_wrappers = sess.execute("SELECT * FROM nu_outbound_wrappers;")

    ret = {}
    print("processing ret 1")
    cols_feed = ('id', 'type', 'srcname', 'feedurl', 'contenturl', 'contentid',
                 'title', 'contents', 'updated', 'published', 'feed_id')
    ret['feed_pages'] = ret_to_dict_list(cols_feed, feed_pages)

    print("processing ret 2")
    nucols = [
        'id', 'actual_target', 'client_id', 'client_key', 'groupinfo',
        'outbound_wrapper', 'referrer', 'releaseinfo', 'seriesname',
        'validated', 'released_on'
    ]
    ret['nu_outbound_wrappers'] = ret_to_dict_list(nucols,
                                                   nu_outbound_wrappers)

    print("Dumping ret")

    with open(
            "db_bak_{}.json".format(
                str(datetime.datetime.now()).replace(":",
                                                     "-").replace(" ", "_")),
            "w") as fp:
        json.dump(ret, fp, indent="	")
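# exposed_dump_raw_feed_data() uses a ret_to_dict_list() helper that is not included in
# this snippet. A minimal sketch of the assumed behaviour: zip each raw result row
# against the column-name tuple, producing a list of plain dicts for the JSON dump.
def ret_to_dict_list(cols, result_rows):
    return [dict(zip(cols, row)) for row in result_rows]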
def exposed_astor_roundtrip_parser_functions():
    '''
	Shove the feed-functions through the astor "round-trip"
	facility.

	Mostly, this homogenizes the indentation, and reformats the function.
	'''

    sess = db.get_db_session()
    res = sess.query(db.RssFeedEntry) \
     .all()

    for row in res:
        func = row.get_func()
        _ast = row._get_ast()
        src = astor.to_source(_ast,
                              indent_with="	",
                              pretty_source=better_pretty_source)

        if src.strip() != row.func.strip():
            try:
                rfdb.str_to_function(src, "testing_compile")
                print("Compiled OK")
                row.func = src
            except Exception:
                print("Compilation failed?")
    sess.commit()
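# exposed_astor_roundtrip_parser_functions() passes a better_pretty_source() callable
# that is not defined in this snippet. astor hands pretty_source a list of generated
# source fragments; a minimal stand-in (an assumption about the real helper) simply
# concatenates them, skipping astor's default line-length splitting:
def better_pretty_source(source):
    return "".join(source)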
def exposed_delete_nu_unresolved():
	'''
	Delete all nu head system rows that have not been reviewed.

	This is needed for historical purges, particularly if
	nu changes their extnu ids, or if the url masking
	mechanism has significant changes.
	'''
	sess = db.get_db_session()

	count = 0
	print("Loading rows....")
	rows = sess.query(db.NuReleaseItem) \
		.options(joinedload('resolved'))    \
		.all()
	print("Loaded %s rows. Scanning." % len(rows))
	for row in rows:

		if len(list(row.resolved)) == 0 and row.reviewed == 'unverified':

			print(row.id, len(list(row.resolved)), row.referrer)
			for bad in row.resolved:
				sess.delete(bad)
			sess.delete(row)
			count += 1
			if count % 500 == 0:
				print("Committing!")
				sess.commit()

	print("Committing!")
	sess.commit()
def exposed_longest_rows():
	'''
	Fetch the rows from the database where the `content` field is longest.
	Return is limited to the biggest 50 rows.
	VERY SLOW (has to scan the entire table)
	'''
	print("Getting longest rows from database")
	have = db.get_db_session().execute("""
		SELECT
			id, url, length(content), content
		FROM
			web_pages
		ORDER BY
			LENGTH(content) DESC NULLS LAST
		LIMIT 50;
		""")
	print("Rows:")

	import os
	import os.path

	savepath = "./large_files/"
	for row in have:
		print(row[0], row[1])
		try:
			os.makedirs(savepath)
		except FileExistsError:
			pass
		with open(os.path.join(savepath, "file %s.txt" % row[0]), "wb") as fp:
			urlst = "URL: %s\n\n" % row[1]
			size = "Length: %s\n\n" % row[2]
			fp.write(urlst.encode("utf-8"))
			fp.write(size.encode("utf-8"))
			fp.write("{}".format(row[3]).encode("utf-8"))
	def __init__(self, connect=True):
		super().__init__()

		self.name_lut, self.group_lut = load_lut()
		self.db_sess = db.get_db_session(postfix='nu_header')

		if connect:
			self.check_open_rpc_interface()
def exposed_process_nu_pages(transmit=True):
	'''
	Re-process all locally saved novelupdates pages.
	'''


	wg = common.util.webFunctions.WebGetRobust()
	sess = db.get_db_session()

	if transmit == True:
		print("Transmitting processed results")
		rm = common.RunManager.Crawler(1, 1)
		message_q = rm.start_aggregator()
	else:
		print("Not translating processed results")
		message_q = queue.Queue()

	pages = []
	print("Beginning DB retreival")
	for row in sess.query(db.WebPages) \
		.filter(db.WebPages.netloc == "www.novelupdates.com") \
		.filter(db.WebPages.url.ilike("%/series/%")) \
		.yield_per(50).all():

		rowtmp = {
			"pageUrl"   : row.url,
			"pgContent" : row.content,
			"type"      : row.mimetype,
			"wg"        : wg,
			"message_q" : message_q,
		}
		pages.append(rowtmp)

		if len(pages) % 100 == 0:
			print("Loaded %s pages..." % len(pages))
	sess.flush()
	sess.commit()
	for row in pages:
		try:
			# print(row, row.url, row.state)
			if row['pgContent'] and NuSeriesPageFilter.NUSeriesPageProcessor.wantsUrl(row['pageUrl']):
				proc = NuSeriesPageFilter.NUSeriesPageProcessor(db_sess=sess, **row)
				proc.extractContent()
		except Exception:
			print("")
			print("ERROR!")
			for line in traceback.format_exc().split("\n"):
				print(line.rstrip())
			print("")
		except KeyboardInterrupt:
			break

	runStatus.run_state.value = 0

	if transmit == True:
		rm.join_aggregator()

	print(sess)
	def wg(self):
		if getattr(self, '_SiteArchiver__wg', None) is None:
			print("Creating WG Interface!")
			alt_cj = dbCj.DatabaseCookieJar(db=self.db, session=db.get_db_session(postfix="_cookie_interface"))
			self.__wg = WebRequest.WebGetRobust(
					use_socks     = self.__wr_use_socks,
					alt_cookiejar = alt_cj,
					custom_ua     = self.__wr_ua_override,
				)
		return self.__wg
def sync_raw_with_filesystem():

    sess = db.get_db_session()

    print("Loading files from database...")
    spinner1 = Spinner()
    in_db = []
    for row in sess.query(db.RawWebPages).yield_per(1000):
        if row.fspath:
            in_db.append(row.fspath)
            spinner1.next(vlen=len(row.fspath))
        else:
            spinner1.next(star=True)

    in_db = set(in_db)

    tgtpath = settings.RAW_RESOURCE_DIR
    print("")
    print("Enumerating files from disk...")
    agg_files = []
    have_files = []
    spinner2 = Spinner()
    for root, dirs, files in os.walk(tgtpath):
        for filen in files:
            fqpath = os.path.join(root, filen)
            fpath = fqpath[len(tgtpath) + 1:]

            if fpath in in_db:
                spinner2.next(star=True, vlen=0)
                have_files.append(fpath)
            else:
                spinner2.next(vlen=1)
                agg_files.append(fpath)
                fqpath = os.path.join(tgtpath, fpath)
                os.unlink(fqpath)
                print("\rDeleting: %s  " % fqpath)

    print()
    print("Found %s files (%s unique)" % (len(agg_files), len(set(agg_files))))

    missing_files = set(in_db) - set(have_files)

    for filen in agg_files:
        print("Should delete: '%s'" % filen)
    for filen in missing_files:
        print("Missing: '%s'" % filen)

        sess.query(db.RawWebPages) \
            .filter(db.RawWebPages.fspath == filen) \
            .update({"state": "new", "fspath": None})
        sess.commit()
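# sync_raw_with_filesystem() and sync_filtered_with_filesystem() below rely on a Spinner
# progress helper that is not part of this snippet. A minimal stand-in with the same
# call signature (an assumption about the real class, which just renders progress):
class Spinner(object):
    def __init__(self):
        self.count = 0

    def next(self, vlen=1, star=False, output=True):
        # vlen and star only affect how the real implementation renders its output.
        self.count += 1
        if output and self.count % 1000 == 0:
            print("\rProcessed %s items..." % self.count, end="", flush=True)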
def exposed_process_nu_pages(transmit=True):
	'''
	Re-process all locally saved novelupdates pages.
	'''


	wg = common.util.webFunctions.WebGetRobust()
	sess = db.get_db_session()

	if transmit == True:
		rm = common.RunManager.Crawler(1, 1)
		message_q = rm.start_aggregator()
	else:
		message_q = queue.Queue()

	pages = []
	for row in sess.query(db.WebPages) \
		.filter(db.WebPages.netloc == "www.novelupdates.com") \
		.yield_per(50).all():

		rowtmp = {
			"pageUrl"   : row.url,
			"pgContent" : row.content,
			"type"      : row.mimetype,
			"wg"        : wg,
			"message_q" : message_q,
		}
		pages.append(rowtmp)

		if len(pages) % 100 == 0:
			print("Loaded %s pages..." % len(pages))
	sess.flush()
	sess.commit()
	for row in pages:
		try:
			# print(row, row.url, row.state)
			if row['pgContent'] and NuSeriesPageFilter.NUSeriesPageProcessor.wantsUrl(row['pageUrl']):
				proc = NuSeriesPageFilter.NUSeriesPageProcessor(db_sess=sess, **row)
				proc.extractContent()
		except Exception:
			print("")
			print("ERROR!")
			for line in traceback.format_exc().split("\n"):
				print(line.rstrip())
			print("")
		except KeyboardInterrupt:
			break

	runStatus.run_state.value = 0

	if transmit == True:
		rm.join_aggregator()

	print(sess)
def exposed_reset_raw_missing():
	'''
	Retrigger all raw-archive links that don't seem to have
	a corresponding file on-disk.
	'''

	sess = db.get_db_session()

	bad = 0
	for row in sess.query(db.RawWebPages).yield_per(1000):
		if row.fspath:


			nl, rest = row.fspath.split("/", 1)
			nl = nl.split(".")
			nl.reverse()
			nl = "/".join(nl)
			newp = nl + "/" + rest

			old = os.path.join(C_RAW_RESOURCE_DIR, "old", row.fspath)
			new = os.path.join(C_RAW_RESOURCE_DIR, newp)

			if os.path.exists(new) and row.fspath == newp:
				#print("Nothing to do: ", row.fspath, new, newp)
				pass
			elif os.path.exists(new):
				print("Relinking: ", newp, row.fspath)
				row.fspath = to_locpath(new)
				bad += 1
			elif os.path.exists(old):
				dirPath = os.path.split(new)[0]
				if not os.path.exists(dirPath):
					os.makedirs(dirPath)
				shutil.move(old, new)

				row.fspath = to_locpath(new)
				bad += 1
				print("Moving: ", old, new)
			else:
				row.state = "new"
				bad += 1
		else:
			row.state = "new"
			bad += 1

		if bad > 25000:
			print("Committing!")
			bad = 0
			sess.commit()
	sess.commit()
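# exposed_reset_raw_missing() relies on a to_locpath() helper that is not part of this
# snippet. A plausible sketch (an assumption, not the real implementation): convert an
# absolute path under C_RAW_RESOURCE_DIR back into the resource-dir-relative form that
# is stored in the fspath column.
import os.path

def to_locpath(fqpath):
	return os.path.relpath(fqpath, C_RAW_RESOURCE_DIR)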
	def do_task(self):

		db_handle = db.get_db_session()

		hadjob = False
		try:
			self.archiver = WebMirror.Engine.SiteArchiver(self.cookie_lock, new_job_queue=self.new_job_queue, response_queue=self.resp_queue, db_interface=db_handle)
			hadjob = self.archiver.taskProcess()
		finally:
			# Clear out the sqlalchemy state
			db_handle.expunge_all()
			db.delete_db_session()

		return hadjob
def reset_homepages():
    import tqdm
    import common.database as db

    sess = db.get_db_session()
    for pageno in tqdm.trange(1, 1001):
        url = "https://www.novelupdates.com/?pg=%d" % pageno
        have = sess.query(db.WebPages)                                     \
         .filter(db.WebPages.url==url) \
         .scalar()

        if have:
            have.state = 'new'
            have.epoch = 0
            sess.commit()
def resetInProgress():
	print("Resetting any stalled downloads from the previous session.")

	sess = db.get_db_session()
	sess.query(db.WebPages) \
		.filter(
				(db.WebPages.state == "fetching")           |
				(db.WebPages.state == "processing")         |
				(db.WebPages.state == "specialty_deferred") |
				(db.WebPages.state == "specialty_ready")
				)   \
		.update({db.WebPages.state : "new"})
	sess.commit()
	sess.close()
	db.delete_db_session()
def exposed_delete_gravitytales_bot_blocked_pages():
    '''
	Delete the "checking you're not a bot" garbage pages
	that sometimes get through the gravitytales scraper.
	'''
    sess = db.get_db_session()
    tables = [db.WebPages.__table__, version_table(db.WebPages)]

    for ctbl in tables:
        update = ctbl.delete() \
         .where(ctbl.c.netloc == "gravitytales.com") \
         .where(ctbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
        print(update)
        sess.execute(update)
        sess.commit()
def exposed_import_feed_parse_funcs():
    '''
	Import the feed parsing functions into the database.
	'''

    sess = db.get_db_session()

    # parse_map = WebMirror.OutputFilters.rss.FeedDataParser.RSS_PARSE_FUNCTION_MAP
    # for key, func in parse_map.items():
    # 	func_str = astor.to_source(astor.code_to_ast(func), indent_with="	")
    # 	update_func(sess, key, func_str)

    name_map = WebMirror.OutputFilters.util.feedNameLut.mapper

    for key, val in name_map.items():
        add_name(sess, key, val)
    def get_times(self):
        conn = database.get_db_session()
        aps = conn.execute("SELECT job_state FROM apscheduler_jobs;")

        update_times = []
        for blob, in aps:
            job_dict = pickle.loads(blob)
            update_times.append(
                (job_dict['id'], job_dict['next_run_time'].isoformat()))

        data = {
            "update-times": update_times,
        }
        database.delete_db_session()

        return pack_message("system-update-times", data)
def exposed_raw_test_retrieve(url):
	'''
	Lower level fetch test, otherwise similar to `test_retreive`
	'''

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass


	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))


	sess = db.get_db_session()

	row = sess.query(db.RawWebPages).filter(db.RawWebPages.url == url).scalar()
	if row:
		row.state = 'new'
	else:
		row = db.RawWebPages(
			url       = url,
			starturl  = root,
			netloc    = parsed.netloc,
			distance  = 50000,
			priority  = 500000,
			state     = 'new',
			fetchtime = datetime.datetime.now(),
			)
		sess.add(row)


	try:
		archiver = RawArchiver.RawEngine.RawSiteArchiver(
			total_worker_count = 1,
			worker_num         = 0,
			new_job_queue      = None,
			cookie_lock        = None,
			db_interface       = sess,
			response_queue     = None
			)
		job     = archiver.do_job(row)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()
    def clean_files(self):

        session = db.get_db_session()
        q = session.query(db.WebFiles).filter(db.WebFiles.fspath != None)

        self.log.info("Querying for non-null filepaths...")
        have = q.all()
        self.log.info("Have %s local files.", len(have))
        count = 0
        for file in have:
            fpath = os.path.join(settings.RESOURCE_DIR, file.fspath)
            if not os.path.exists(fpath):
                self.log.error("Missing file: %s", fpath)

            count += 1
            if count % 1000 == 0:
                self.log.info("Scanned %s files.", count)
	def launch_agg(cls, agg_queue):
		try:
			common.stuck.install_pystuck()
			agg_db = db.get_db_session()
			instance = cls(agg_queue, agg_db)
			instance.run()
			instance.close()
		except Exception as e:
			import traceback
			print()
			print()
			print()
			print()
			print()
			print()
			print("Aggregator exception!")
			traceback.print_exc()
def exposed_print_scheduled_jobs():
	'''
	Print the contents of the apscheduler_jobs table, unpickling each
	job's stored state for display.
	'''
	sess = db.get_db_session()

	items = sess.execute("""
		SELECT
			id, next_run_time , job_state
		FROM
			apscheduler_jobs
	""")
	items = list(items)
	for tid, nextcall, content in items:
		print("Job: ", tid.ljust(30), str(nextcall).rjust(20))

		dat = pickle.loads(content)
		pprint.pprint(dat)
def exposed_drop_priorities():
	'''
	Reset the priority of every row in the table to the IDLE_PRIORITY level
	'''

	step  = 10000

	sess = db.get_db_session()
	print("Getting minimum row in need or update..")
	start = sess.execute("""SELECT min(id) FROM web_pages WHERE priority < 500000""")
	start = list(start)[0][0]
	print("Minimum row ID: ", start, "getting maximum row...")
	stop = sess.execute("""SELECT max(id) FROM web_pages WHERE priority < 500000""")
	stop = list(stop)[0][0]
	print("Maximum row ID: ", stop)

	if not start:
		print("No null rows to fix!")
		return

	print("Need to fix rows from %s to %s" % (start, stop))
	start = start - (start % step)

	changed = 0
	for idx in range(start, stop, step):
		# SQL String munging! I'm a bad person!
		# Only done because I can't easily find how to make sqlalchemy
		# bind parameters ignore the postgres specific cast
		# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
		have = sess.execute("""update web_pages set priority = 500000 where priority < 500000 AND id > {} AND id <= {};""".format(idx, idx+step))
		# print()

		processed  = idx - start
		total_todo = stop - start
		print('%10i, %10i, %7.4f, %6i' % (idx, stop, processed/total_todo * 100, have.rowcount))
		changed += have.rowcount
		if changed > 100000:
			print("Committing (%s changed rows)...." % changed, end=' ')
			sess.commit()
			print("done")
			changed = 0

	sess.commit()
def exposed_purge_raw_invalid_urls():
	'''
	Delete all raw-archiver rows that aren't
	attached to an archiver module.
	'''

	sess = db.get_db_session()

	bad = 0
	for row in sess.query(db.RawWebPages).yield_per(1000).all():
		if not any([mod.cares_about_url(row.url) for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES]):
			print("Unwanted: ", row.url)
			sess.delete(row)
			bad += 1
		if bad > 5000:
			print("Committing!")
			bad = 0
			sess.commit()
	sess.commit()
def exposed_underp_rss_functions():
    '''
	Do stupid fixes to the RSS database.
	'''
    bad = '''	if not (chp or vol) or 'preview' in item['title'].lower():
		return False'''
    good = '''	if not (chp or vol) or 'preview' in item['title'].lower():
		return None'''

    sess = db.get_db_session()

    rows = sess.query(db.RssFeedEntry).all()
    for row in rows:
        if bad in row.func:
            row.func = row.func.replace(bad, good)
            print(row)
            print(row.func)
    sess.commit()
    pass
def exposed_delete_old_nu_root_outbound():
	'''
	Delete NU outbound links that use the homepage as their referrer.

	Apparently NU was validating the referrer to see if the referring page actually had
	the referring link on it, or /something/.

	Anyways, it's easier to generate a permanent referrer by just pointing it at
	the series page.
	'''


	sess = db.get_db_session()

	for row in sess.query(db.NuReleaseItem) \
		.filter(not_(db.NuReleaseItem.referrer.like("%novelupdates.com/series%"))) \
		.yield_per(50).all():
		if not len(list(row.resolved)):
			print(row.id, row.referrer)
			sess.delete(row)
			sess.commit()
def sync_filtered_with_filesystem():
	tgtpath = settings.RESOURCE_DIR

	sess = db.get_db_session()

	print("Loading files from database...")
	spinner1 = Spinner()
	in_db = []
	chunk_cnt = 0
	for row in sess.query(db.WebFiles).yield_per(10000):
		chunk_cnt += 1
		if row.fspath:
			in_db.append(row.fspath)
			spinner1.next(vlen=len(row.fspath), output=(chunk_cnt == 10))
			if chunk_cnt == 40:
				chunk_cnt = 0

	origl = len(in_db)
	in_db = set(in_db)

	print("")
	print("%s files, %s unique" % (origl, len(in_db)))
	print("Enumerating files from disk...")
	agg_files = []
	have_files = []
	spinner2 = Spinner()
	for root, dirs, files in os.walk(tgtpath):
		for filen in files:
			fqpath = os.path.join(root, filen)
			fpath = fqpath[len(tgtpath)+1:]

			if fpath in in_db:
				spinner2.next(star=True, vlen=0)
				have_files.append(fpath)
			else:
				spinner2.next(vlen=1)
				agg_files.append(fpath)
				fqpath = os.path.join(tgtpath, fpath)
				# os.unlink(fqpath)
				print("\rDeleting: %s  " % fqpath)
def dump_scheduled_jobs(sched):
	print("Scheduled jobs:")
	existing = sched.get_jobs()
	if not existing:
		print("	No jobs in scheduler!")

	tznow = datetime.datetime.now(tz=pytz.utc)
	for job in existing:
		print("	", job, job.args, "running in:", job.next_run_time - tznow, (job.id, ))

	session = db.get_db_session()
	running = session.query(db.PluginStatus).filter(db.PluginStatus.is_running == True).all()
	print("Running jobs:")
	for jitem in running:
		print("	", jitem.plugin_name, jitem.is_running, jitem.last_run, jitem.last_error, jitem.last_error_msg)
	if not running:
		print("	<None!>")

	print("Running threads:")
	for thread in threading.enumerate():
		print("	", thread.getName(), thread)
	db.delete_db_session()
def exposed_fetch(url, debug=True, rss_debug=False):
	'''
	Do a synchronous fetch of content from url `url`.
	'''

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
		)

	if debug:
		print(new)

	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job     = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()
def do_db_sync():

    sess = db.get_db_session()
    res = sess.query(db.RssFeedEntry) \
     .all()
    have_funcs = {row.feed_name: (row.func, row.last_changed) for row in res}
    sess.commit()

    this_dir = os.path.dirname(__file__)
    func_json_path = os.path.join(this_dir, "function_database.json")

    file_funcs = {}
    try:
        if os.path.exists(func_json_path):
            with open(func_json_path, "r") as fp:
                data = fp.read()
                if data:
                    file_funcs = json.loads(data)
    except json.JSONDecodeError:
        pass

    if have_funcs == file_funcs:
        print("Function storage file is up-to-date. Nothing to do!")
        return

    print("Updating function database file.")

    def datetime_handler(x):
        if isinstance(x, datetime.datetime):
            return x.isoformat()
        raise TypeError("Unknown type")

    with open(func_json_path, "w") as fp:
        json.dump(have_funcs,
                  fp,
                  indent=True,
                  sort_keys=True,
                  default=datetime_handler)
	def fix_missing_history(self):

		sess = db.get_db_session()
		self.qlog.info("Querying for items without any history")
		end = sess.execute("""
			SELECT
				t1.url
			FROM
				web_pages t1
			LEFT JOIN
				web_pages_version t2 ON t2.url = t1.url
			WHERE
				t2.url IS NULL
			""")
		end = [tmp[0] for tmp in end]

		self.log.info("Found %s rows missing history content!", len(end))

		remaining = len(end)
		for urlset in batch(end, 50):
			remaining = remaining - len(urlset)
			self.tickle_rows(sess, urlset)
			self.log.info("Processed %s of %s (%s%%)", len(end)-remaining, len(end), 100-((remaining/len(end)) * 100) )
def exposed_nu_new():
	'''
	Parse outbound netlocs from NovelUpdates releases, extracting
	any sites that are not known in the feednamelut.
	'''

	import WebMirror.OutputFilters.util.feedNameLut as fnl
	sess = db.get_db_session()

	nu_items = sess.query(db.NuOutboundWrapperMap)             \
		.filter(db.NuOutboundWrapperMap.validated == True)     \
		.filter(db.NuOutboundWrapperMap.actual_target != None) \
		.all()

	netlocs = [urllib.parse.urlsplit(row.actual_target).netloc for row in nu_items]
	print("Nu outbound items: ", len(netlocs))
	netlocs = set(netlocs)

	for netloc in netlocs:
		if not fnl.getNiceName(None, netloc):
			fnl.getNiceName(None, netloc, debug=True)
			print("Missing: ", netloc)
	print("Nu outbound items: ", len(netlocs))
	def __init__(self, job_name):

		if not job_name in CALLABLE_LUT:
			raise JobNameException("Callable '%s' is not in the class lookup table: '%s'!" % (job_name, CALLABLE_LUT))
		self.runModule = CALLABLE_LUT[job_name]
		self.job_name = job_name


		session = db.get_db_session()

		try:
			query = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name==job_name)
			have = query.scalar()
			if not have:
				new = db.PluginStatus(plugin_name=job_name)
				session.add(new)
				session.commit()
		except sqlalchemy.exc.OperationalError:
			session.rollback()
		except sqlalchemy.exc.InvalidRequestError:
			session.rollback()

		finally:
			db.delete_db_session()
def exposed_fix_null():
	'''
	Reset any rows in the table where the `ignoreuntiltime` column
	is null. Updates in 50K row increments.
	'''
	step = 50000


	end = db.get_db_session().execute("""SELECT MAX(id) FROM web_pages WHERE  ignoreuntiltime IS NULL;""")
	end = list(end)[0][0]

	start = db.get_db_session().execute("""SELECT MIN(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
	start = list(start)[0][0]

	changed = 0

	if not start:
		print("No null rows to fix!")
		return

	start = start - (start % step)

	for x in range(start, end, step):
		# SQL String munging! I'm a bad person!
		# Only done because I can't easily find how to make sqlalchemy
		# bind parameters ignore the postgres specific cast
		# The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
		have = db.get_db_session().execute("""UPDATE web_pages SET ignoreuntiltime = 'epoch'::timestamp WHERE ignoreuntiltime IS NULL AND id < %s AND id >= %s;""" % (x, x-step))
		# print()
		print('%10i, %7.4f, %6i' % (x, x/end * 100, have.rowcount))
		changed += have.rowcount
		if changed > 10000:
			print("Committing (%s changed rows)...." % changed, end=' ')
			db.get_db_session().commit()
			print("done")
			changed = 0
	db.get_db_session().commit()
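# An alternative sketch for the batched UPDATE above (not the approach the function
# actually uses): keep the postgres-specific '::timestamp' cast as a literal and bind
# only the id bounds, so nothing has to be interpolated into the SQL string by hand.
# Assumes the same web_pages schema.
from sqlalchemy import text

def fix_null_batch(sess, lo, hi):
	stmt = text(
		"UPDATE web_pages "
		"SET ignoreuntiltime = 'epoch'::timestamp "
		"WHERE ignoreuntiltime IS NULL AND id < :hi AND id >= :lo"
	)
	return sess.execute(stmt, {'hi': hi, 'lo': lo}).rowcount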
def before_request():
    g.locale = 'en'
    g.session = database.get_db_session(flask_sess_if_possible=False)
    print("Checked out session")
def exposed_rss_db_sync(target = None, days=False, silent=False):
	'''
	Run the stored RSS feed history back through the feed-parsing system, generating a
	log file of the feed articles that were not captured by the feed parsing system.

	Target is an optional netloc. If not none, only feeds with that netloc are
		processed.
	Days is the number of days into the past to process. None results in all
		available history being read.
	Silent suppresses some debug printing to the console.
	'''

	json_file = 'rss_filter_misses-1.json'

	write_debug = True
	if silent:
		config.C_DO_RABBIT = False
	if target:
		config.C_DO_RABBIT = False
		flags.RSS_DEBUG    = True
		write_debug = False
	else:
		try:
			os.unlink(json_file)
		except FileNotFoundError:
			pass

	import WebMirror.processor.RssProcessor
	parser = WebMirror.processor.RssProcessor.RssProcessor(loggerPath   = "Main.RssDb",
															pageUrl     = 'http://www.example.org',
															pgContent   = '',
															type        = 'application/atom+xml',
															transfer    = False,
															debug_print = True,
															db_sess = None,
															write_debug = write_debug)


	print("Getting feed items....")

	if target:
		print("Limiting to '%s' source." % target)
		feed_items = db.get_db_session().query(db.FeedItems) \
				.filter(db.FeedItems.srcname == target)    \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)           \
				.all()
	elif days:
		print("RSS age override: ", days)
		cutoff = datetime.datetime.now() - datetime.timedelta(days=days)
		feed_items = db.get_db_session().query(db.FeedItems) \
				.filter(db.FeedItems.published > cutoff)  \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)             \
				.all()
	else:
		feed_items = db.get_db_session().query(db.FeedItems) \
				.order_by(db.FeedItems.srcname)           \
				.order_by(db.FeedItems.title)           \
				.all()


	print("Feed items: ", len(feed_items))

	for item in feed_items:
		ctnt = {}
		ctnt['srcname']   = item.srcname
		ctnt['title']     = item.title
		ctnt['tags']      = item.tags
		ctnt['linkUrl']   = item.contenturl
		ctnt['guid']      = item.contentid
		ctnt['published'] = calendar.timegm(item.published.timetuple())

		# Pop()ed off in processFeedData().
		ctnt['contents']  = 'wat'

		try:
			parser.processFeedData(ctnt, tx_raw=False, tx_parse=not bool(days))
		except ValueError:
			pass
		# print(ctnt)
	if target == None:
		exposed_sort_json(json_file)
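# Hypothetical invocations of exposed_rss_db_sync(), matching the docstring above; the
# target netloc is illustrative only.
exposed_rss_db_sync()                                        # full history, transmit parse results
exposed_rss_db_sync(days=7)                                  # last week only, parse results not transmitted
exposed_rss_db_sync(target="www.example-translations.com")   # single source, RabbitMQ off, RSS debug on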