def exposed_clear_bad():
	'''
	Iterate over all blocked strings from the various YAML rules, deleting any
	occurances of each from the database.

	SLOW
	'''
	# Removed unused `from sqlalchemy.dialects import postgresql` import.
	rules = WebMirror.rules.load_rules()
	sess = db.get_db_session()

	for ruleset in rules:
		print("Cleaning ruleset")
		for badword in ruleset['badwords']:
			# A ruleset with no netlocs can't match any rows at all.
			if not ruleset['netlocs']:
				continue
			if "%" in badword:
				# A literal '%' would act as an extra LIKE wildcard; just report it.
				print(badword)
			else:
				print("Deleting items containing string: '%s'" % badword)
				# Build the filter once and reuse it for both the count and
				# the delete (the original constructed the query twice).
				q = sess.query(db.WebPages)                                 \
					.filter(db.WebPages.netloc.in_(ruleset['netlocs']))     \
					.filter(db.WebPages.url.like("%{}%".format(badword)))
				items = q.count()
				if items:
					print("%s results for : '%s'" % (items, badword))
					q.delete(synchronize_session=False)
					sess.commit()
def exposed_purge_raw_invalid_urls_from_history():
	'''
	Delete all raw-archiver rows that aren't attached to a archiver module.
	'''
	sess1 = db.get_db_session(postfix='iter_sess')
	sess2 = db.get_db_session(postfix='delete_sess')

	ctbl = version_table(db.RawWebPages.__table__)

	print("Loading files from database...")
	est = sess1.execute("SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='raw_web_pages_version';")
	res = est.scalar()
	print("Estimated row-count: %s" % res)

	last_bad     = ""
	deleted      = 0
	total_rows   = 0
	last_commit  = 0
	maxlen       = 0
	changed_rows = 0

	with tqdm.tqdm(total=res) as pbar:
		bad = 0
		for rurl, rnetloc in sess1.query(ctbl.c.url, ctbl.c.netloc).yield_per(1000):
			# BUG FIX: total_rows was previously incremented only for deleted
			# rows, which made the reported percentage always 100% and raised
			# ZeroDivisionError when the first row examined was a wanted one.
			total_rows += 1

			modules_wants_url = any([mod.cares_about_url(rurl) for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES])
			has_badwords      = any([badword in rurl for badword in common.global_constants.GLOBAL_BAD_URLS])

			if not modules_wants_url or has_badwords:
				last_bad = rnetloc
				changed_rows = sess2.query(ctbl)            \
					.filter(ctbl.c.url == rurl)             \
					.delete(synchronize_session=False)
				bad     += 1
				deleted += 1

				if bad > 5000:
					bad = 0
					last_commit = deleted
					sess2.commit()
			else:
				msg = "Deleted: %s, since commit: %s, last_bad: '%s' (%s, %s%%)" % \
					(deleted, deleted-last_commit, last_bad, changed_rows, 100.0*(deleted / total_rows))
				maxlen = max(len(msg), maxlen)
				pbar.set_description(msg.ljust(maxlen), refresh=False)

			pbar.update(n=1)

	sess1.commit()
	sess2.commit()
def exposed_update_feed_names():
	'''
	Apply any new feednamelut names to existing fetched RSS posts.
	'''
	# Acquire the session once instead of on every loop iteration.
	sess = db.get_db_session()
	for key, value in feedNameLut.mapper.items():
		feed_items = sess.query(db.FeedItems)               \
				.filter(db.FeedItems.srcname == key)        \
				.all()
		if feed_items:
			for item in feed_items:
				item.srcname = value
			print(len(feed_items))
			print(key, value)
			sess.commit()
def resetRunStates():
	'''
	Mark every PluginStatus row as not-running. Called during job setup so
	stale "running" flags from a previous process don't block execution.
	'''
	print("JobSetup call resetting run-states!")
	sess = db.get_db_session()
	sess.query(db.PluginStatus).update({db.PluginStatus.is_running : False})
	sess.commit()
	db.delete_db_session()
	print("Run-states reset.")
def consolidate_history(self):
	'''
	Find URLs with more than 10 history rows in web_pages_version and
	truncate each one's history, retrying on transient DB errors.
	'''
	sess = db.get_db_session()
	self.qlog.info("Querying for items with significant history size")
	end = sess.execute("""
			SELECT
				count(*), url
			FROM web_pages_version
			GROUP BY url
			HAVING COUNT(*) > 10
			ORDER BY url
		""")
	end = list(end)
	self.qlog.info("Found %s items with more then 10 history entries. Processing", len(end))
	for count, url in end:
		while 1:
			try:
				self.truncate_url_history(url)
				break
			except sqlalchemy.exc.OperationalError:
				# BUG FIX: roll back the session actually used here. The
				# original called self.sess.rollback(), which targets a
				# different session handle than the local `sess`.
				sess.rollback()
def exposed_delete_nu_unresolved():
	'''
	Delete all nu head system rows that have not been reviewed.
	This is needed for historical purges, particularly if nu changes their
	extnu ids.
	'''
	sess = db.get_db_session()
	removed = 0
	for row in sess.query(db.NuReleaseItem) \
		.yield_per(50).all():

		resolved_count = len(list(row.resolved))
		if resolved_count != 3 and row.reviewed == 'unverified':
			print(row.id, resolved_count, row.referrer)
			# Remove child resolution rows first, then the head row.
			for child in row.resolved:
				sess.delete(child)
			sess.delete(row)
			removed += 1

	print("Committing!")
	sess.commit()
def exposed_delete_spcnet_invalid_url_pages():
	'''
	So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
	Anyways, delete those.
	'''
	sess = db.get_db_session()
	tables = [
		db.WebPages.__table__,
		version_table(db.WebPages)
	]
	for ctbl in tables:
		# Query for affected rows in both the live and the history table.
		q = sess.query(ctbl.c.id)                       \
			.filter(ctbl.c.netloc == "www.spcnet.tv")   \
			.filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
		print("Query:")
		print(q)
		ids = q.all()

		ids = set(ids)
		# Returned list of IDs is each ID packed into a 1-tuple. Unwrap those
		# tuples so it's just a list of integer IDs.
		ids = [tmp[0] for tmp in ids]

		# BUG FIX: corrected typos in the message ("Fount" -> "Found",
		# "requring" -> "requiring").
		print("Found %s rows requiring deletion. Deleting." % len(ids))
		delete_internal(sess, ids)
		sess.commit()
def exposed_delete_feed(feed_name, do_delete, search_str):
	'''
	Feed name is the readable name of the feed, from feedNameLut.py.
	do delete is a boolean that determines if the deletion is actually done,
	or the actions are just previewed. Unless do_delete.lower() == "true",
	no action will actually be taken.
	search_str is the string of items to search for. Searches are case
	sensitive, and the only component of the feed that are searched within
	is the title. search_str is split on the literal character "|", for
	requiring multiple substrings be in the searched title.

	Delete the rss entries for a feed, using a search key.
	'''
	sess = db.get_db_session()
	rows = sess.query(db.FeedItems)                     \
		.filter(db.FeedItems.srcname == feed_name)      \
		.all()

	really_delete = "true" in do_delete.lower()
	required_substrings = search_str.split("|")

	for row in rows:
		haystack = " ".join([row.title] + row.tags)
		if all(needle in haystack for needle in required_substrings):
			print(haystack)
			if really_delete:
				print("Deleting item")
				sess.delete(row)

	sess.commit()
def doCall(self):
	'''
	Run the wrapped plugin callable, tracking its start/stop/error state in
	the PluginStatus table and refusing re-entrant invocation.
	'''
	self.log.info("Calling job %s", self.job_name)
	session = db.get_db_session()
	status = session.query(db.PluginStatus)                             \
		.filter(db.PluginStatus.plugin_name == self.job_name)           \
		.one()

	if status.is_running:
		session.commit()
		self.log.error("Plugin %s is already running! Not doing re-entrant call!", self.job_name)
		return

	status.is_running = True
	status.last_run = datetime.datetime.now()
	session.commit()

	try:
		self._doCall()
	except Exception:
		status.last_error = datetime.datetime.now()
		status.last_error_msg = traceback.format_exc()
		raise
	finally:
		# Re-query rather than reusing the earlier instance, since the
		# called job may have invalidated the session state.
		fresh = session.query(db.PluginStatus)                          \
			.filter(db.PluginStatus.plugin_name == self.job_name)       \
			.one()
		fresh.is_running = False
		fresh.last_run_end = datetime.datetime.now()
		session.commit()
		db.delete_db_session()

	self.log.info("Job %s complete.", self.job_name)
def exposed_delete_comment_feed_items():
	'''
	Iterate over all retreived feed article entries, and delete any that
	look like they're comment feed articles.
	'''
	sess = db.get_db_session()
	# Renamed the loop variable; the original shadowed the result list.
	rows = sess.query(db.FeedItems)                 \
		.filter(or_(
			db.FeedItems.contenturl.like("%#comment-%"),
			db.FeedItems.contenturl.like("%CommentsForInMyDaydreams%"),
			db.FeedItems.contenturl.like("%www.fanfiction.net%"),
			db.FeedItems.contenturl.like("%www.fictionpress.com%"),
			db.FeedItems.contenturl.like("%www.booksie.com%")))     \
		.order_by(db.FeedItems.contenturl)          \
		.all()

	count = 0
	for row in rows:
		print(row.contenturl)
		# Empty the association collections before deleting the row itself.
		while row.author:
			row.author.pop()
		while row.tags:
			row.tags.pop()
		sess.delete(row)
		count += 1
		if count % 1000 == 0:
			print("Committing at %s" % count)
			sess.commit()

	print("Done. Committing...")
	sess.commit()
def qidianSmartFeedFetch(params, rid, joburl, netloc, job_aggregator_instance):
	'''
	Build and enqueue a SmartWebRequest job for the qidian feed, attaching
	the most recent 500 rows of per-post metadata from the database.
	'''
	print('qidianSmartFeedFetch', params, rid, joburl, netloc)
	sess = db.get_db_session(flask_sess_if_possible=False)
	recent = sess.query(db.QidianFeedPostMeta)              \
		.order_by(desc(db.QidianFeedPostMeta.id))           \
		.limit(500)                                         \
		.all()
	meta_dict = {entry.contentid: entry.meta for entry in recent}
	sess.commit()

	raw_job = WebMirror.JobUtils.buildjob(
		module         = 'SmartWebRequest',
		call           = 'qidianSmartFeedFetch',
		dispatchKey    = "fetcher",
		jobid          = rid,
		args           = [joburl],
		kwargs         = {'meta': meta_dict},
		additionalData = {},
		postDelay      = 0,
		serialize      = "QidianModule",
	)
	job_aggregator_instance.put_job(raw_job)
def initializeStartUrls(rules):
	'''
	Insert a WebPages row for every ruleset start-URL that is not already
	present in the database.
	'''
	print("Initializing all start URLs in the database")
	sess = db.get_db_session()
	for ruleset in [rset for rset in rules if rset['starturls']]:
		for starturl in ruleset['starturls']:
			have = sess.query(db.WebPages)                  \
				.filter(db.WebPages.url == starturl)        \
				.count()
			if not have:
				netloc = urlFuncs.getNetLoc(starturl)
				new = db.WebPages(
						url               = starturl,
						starturl          = starturl,
						netloc            = netloc,
						type              = ruleset['type'],
						priority          = db.DB_IDLE_PRIORITY,
						distance          = db.DB_DEFAULT_DIST,
						normal_fetch_mode = ruleset['normal_fetch_mode'],
					)
				print("Missing start-url for address: '{}'".format(starturl))
				sess.add(new)
				try:
					sess.commit()
				# BUG FIX: SQLAlchemyError lives in sqlalchemy.exc; the
				# top-level attribute `sqlalchemy.SQLAlchemyError` used by the
				# original does not exist and would raise AttributeError.
				except sqlalchemy.exc.SQLAlchemyError:
					print("Failure inserting start url for address: '{}'".format(starturl))
					sess.rollback()
	sess.close()
	db.delete_db_session()
def exposed_dump_raw_feed_data():
	'''
	Dump the raw feed data to a json file.
	'''
	import json
	sess = db.get_db_session()
	print("Selecting 1")
	feed_pages = sess.execute("SELECT * FROM feed_pages;")
	print("Selecting 2")
	nu_outbound_wrappers = sess.execute("SELECT * FROM nu_outbound_wrappers;")

	ret = {}

	print("processing ret 1")
	feed_cols = ('id', 'type', 'srcname', 'feedurl', 'contenturl', 'contentid',
		'title', 'contents', 'updated', 'published', 'feed_id')
	ret['feed_pages'] = ret_to_dict_list(feed_cols, feed_pages)

	print("processing ret 2")
	nu_cols = [
		'id', 'actual_target', 'client_id', 'client_key', 'groupinfo',
		'outbound_wrapper', 'referrer', 'releaseinfo', 'seriesname',
		'validated', 'released_on',
	]
	ret['nu_outbound_wrappers'] = ret_to_dict_list(nu_cols, nu_outbound_wrappers)

	print("Dumping ret")
	stamp = str(datetime.datetime.now()).replace(":", "-").replace(" ", "_")
	with open("db_bak_{}.json".format(stamp), "w") as fp:
		json.dump(ret, fp, indent=" ")
def exposed_astor_roundtrip_parser_functions():
	'''
	Shove the feed-functions through the astor "round-trip" facility.
	Mostly, this homogenizes the indentation, and reformats the function.
	'''
	sess = db.get_db_session()
	rows = sess.query(db.RssFeedEntry).all()
	for row in rows:
		# Result unused; call retained in case get_func() has side effects
		# (it presumably validates/compiles the stored source -- verify).
		row.get_func()
		_ast = row._get_ast()
		src = astor.to_source(_ast, indent_with=" ", pretty_source=better_pretty_source)
		if src.strip() != row.func.strip():
			try:
				rfdb.str_to_function(src, "testing_compile")
				print("Compiled OK")
				row.func = src
			except Exception:
				print("Compilation failed?")
	sess.commit()
def exposed_delete_nu_unresolved():
	'''
	Delete all nu head system rows that have not been reviewed.
	This is needed for historical purges, particularly if nu changes their
	extnu ids, or if the url masking mechanism has significant changes.
	'''
	sess = db.get_db_session()
	count = 0

	print("Loading rows....")
	rows = sess.query(db.NuReleaseItem)         \
		.options(joinedload('resolved'))        \
		.all()
	print("Loaded %s rows. Scanning." % len(rows))

	for row in rows:
		resolved_count = len(list(row.resolved))
		if resolved_count == 0 and row.reviewed == 'unverified':
			print(row.id, resolved_count, row.referrer)
			for child in row.resolved:
				sess.delete(child)
			sess.delete(row)
			count += 1
			# Commit periodically so the transaction doesn't grow unboundedly.
			if count % 500 == 0:
				print("Committing!")
				sess.commit()

	print("Committing!")
	sess.commit()
def exposed_longest_rows():
	'''
	Fetch the rows from the database where the `content` field is longest.
	Return is limited to the biggest 50 rows.
	VERY SLOW
	(has to scan the entire table)
	'''
	print("Getting longest rows from database")
	have = db.get_db_session().execute("""
		SELECT
			id, url, length(content), content
		FROM web_pages
		ORDER BY LENGTH(content) DESC NULLS LAST
		LIMIT 50;
		""")
	print("Rows:")

	import os
	import os.path

	savepath = "./large_files/"
	# Hoisted out of the per-row loop: the directory only needs creating once.
	try:
		os.makedirs(savepath)
	except FileExistsError:
		pass

	for row in have:
		print(row[0], row[1])
		with open(os.path.join(savepath, "file %s.txt" % row[0]), "wb") as fp:
			urlst = "URL: %s\n\n" % row[1]
			size = "Length: %s\n\n" % row[2]
			fp.write(urlst.encode("utf-8"))
			fp.write(size.encode("utf-8"))
			fp.write("{}".format(row[3]).encode("utf-8"))
def __init__(self, connect=True):
	'''
	Initialize the name/group lookup tables and a dedicated DB session.
	When `connect` is truthy, also verify the RPC interface is reachable.
	'''
	super().__init__()
	lut_pair = load_lut()
	self.name_lut, self.group_lut = lut_pair
	self.db_sess = db.get_db_session(postfix='nu_header')
	if connect:
		self.check_open_rpc_interface()
def exposed_process_nu_pages(transmit=True):
	'''
	Re-process all locally saved novelupdates pages.
	'''
	wg = common.util.webFunctions.WebGetRobust()
	sess = db.get_db_session()

	if transmit == True:
		print("Transmitting processed results")
		rm = common.RunManager.Crawler(1, 1)
		message_q = rm.start_aggregator()
	else:
		print("Not translating processed results")
		message_q = queue.Queue()

	pages = []
	print("Beginning DB retreival")
	series_query = sess.query(db.WebPages)                          \
		.filter(db.WebPages.netloc == "www.novelupdates.com")       \
		.filter(db.WebPages.url.ilike("%/series/%"))
	for row in series_query.yield_per(50).all():
		pages.append({
			"pageUrl"   : row.url,
			"pgContent" : row.content,
			"type"      : row.mimetype,
			"wg"        : wg,
			"message_q" : message_q,
		})
		if len(pages) % 100 == 0:
			print("Loaded %s pages..." % len(pages))
			sess.flush()
	sess.commit()

	for page in pages:
		try:
			if page['pgContent'] and NuSeriesPageFilter.NUSeriesPageProcessor.wantsUrl(page['pageUrl']):
				proc = NuSeriesPageFilter.NUSeriesPageProcessor(db_sess=sess, **page)
				proc.extractContent()
		except Exception:
			print("")
			print("ERROR!")
			for line in traceback.format_exc().split("\n"):
				print(line.rstrip())
			print("")
		except KeyboardInterrupt:
			break

	runStatus.run_state.value = 0

	if transmit == True:
		rm.join_aggregator()

	print(sess)
def wg(self):
	'''
	Lazily build and cache the WebGetRobust fetch interface.

	NOTE(review): the cache is checked via the explicitly name-mangled
	attribute '_SiteArchiver__wg', so this property assumes it lives on the
	SiteArchiver class (the `self.__wg` assignment below only produces the
	same mangled name there) -- verify if this is moved/subclassed.
	'''
	if getattr(self, '_SiteArchiver__wg', None) is None:
		print("Creating WG Interface!")
		# The cookiejar is DB-backed and gets its own session (postfix
		# "_cookie_interface") so it doesn't interfere with self.db.
		alt_cj = dbCj.DatabaseCookieJar(db=self.db, session=db.get_db_session(postfix="_cookie_interface"))
		self.__wg = WebRequest.WebGetRobust(
				use_socks     = self.__wr_use_socks,
				alt_cookiejar = alt_cj,
				custom_ua     = self.__wr_ua_override,
			)
	return self.__wg
def sync_raw_with_filesystem():
	'''
	Cross-check raw-archive DB rows against the on-disk resource store.
	Files on disk that have no DB row are deleted; DB rows whose backing
	file is missing are reset to state "new" so they get re-fetched.
	'''
	sess = db.get_db_session()
	print("Loading files from database...")
	spinner1 = Spinner()
	in_db = []
	for row in sess.query(db.RawWebPages).yield_per(1000):
		if row.fspath:
			in_db.append(row.fspath)
			spinner1.next(vlen=len(row.fspath))
		else:
			spinner1.next(star=True)

	in_db = set(in_db)
	tgtpath = settings.RAW_RESOURCE_DIR

	print("")
	print("Enumerating files from disk...")

	agg_files  = []
	have_files = []
	spinner2 = Spinner()
	for root, dirs, files in os.walk(tgtpath):
		for filen in files:
			fqpath = os.path.join(root, filen)
			fpath = fqpath[len(tgtpath) + 1:]
			if fpath in in_db:
				spinner2.next(star=True, vlen=0)
				have_files.append(fpath)
			else:
				spinner2.next(vlen=1)
				agg_files.append(fpath)
				fqpath = os.path.join(tgtpath, fpath)
				os.unlink(fqpath)
				print("\rDeleting: %s " % fqpath)

	print()
	print("Found %s files (%s unique)" % (len(agg_files), len(set(agg_files))))

	missing_files = set(in_db) - set(have_files)

	for filen in agg_files:
		print("Should delete: '%s'" % filen)
	for filen in missing_files:
		print("Missing: '%s'" % filen)
		sess.query(db.RawWebPages)                          \
			.filter(db.RawWebPages.fspath == filen)         \
			.update({"state": "new", "fspath": None})
	sess.commit()
def exposed_process_nu_pages(transmit=True):
	'''
	Re-process all locally saved novelupdates pages.
	'''
	wg = common.util.webFunctions.WebGetRobust()
	sess = db.get_db_session()
	if transmit == True:
		rm = common.RunManager.Crawler(1, 1)
		message_q = rm.start_aggregator()
	else:
		message_q = queue.Queue()

	pages = []
	for row in sess.query(db.WebPages) \
		.filter(db.WebPages.netloc == "www.novelupdates.com") \
		.yield_per(50).all():

		rowtmp = {
			"pageUrl"   : row.url,
			"pgContent" : row.content,
			"type"      : row.mimetype,
			"wg"        : wg,
			"message_q" : message_q,
		}
		pages.append(rowtmp)

		# BUG FIX: this was `len(pages) == 100`, which only ever fired once
		# and never again. Report (and flush) every 100 loaded pages, matching
		# the sibling exposed_process_nu_pages implementation.
		if len(pages) % 100 == 0:
			print("Loaded %s pages..." % len(pages))
			sess.flush()
	sess.commit()

	for row in pages:
		try:
			if row['pgContent'] and NuSeriesPageFilter.NUSeriesPageProcessor.wantsUrl(row['pageUrl']):
				proc = NuSeriesPageFilter.NUSeriesPageProcessor(db_sess=sess, **row)
				proc.extractContent()
		except Exception:
			print("")
			print("ERROR!")
			for line in traceback.format_exc().split("\n"):
				print(line.rstrip())
			print("")
		except KeyboardInterrupt:
			break

	runStatus.run_state.value = 0

	if transmit == True:
		rm.join_aggregator()

	print(sess)
def exposed_reset_raw_missing():
	'''
	Retrigger all raw-archive links that don't seem to have a corresponding
	file on-disk.
	'''
	sess = db.get_db_session()
	pending = 0
	for row in sess.query(db.RawWebPages).yield_per(1000):
		if not row.fspath:
			row.state = "new"
			pending += 1
		else:
			# Rebuild the expected path: the leading netloc segment is stored
			# dotted, but lives on disk with the components reversed.
			prefix, remainder = row.fspath.split("/", 1)
			segments = prefix.split(".")
			segments.reverse()
			newp = "/".join(segments) + "/" + remainder

			old = os.path.join(C_RAW_RESOURCE_DIR, "old", row.fspath)
			new = os.path.join(C_RAW_RESOURCE_DIR, newp)

			if os.path.exists(new) and row.fspath == newp:
				# Already migrated; nothing to do.
				pass
			elif os.path.exists(new):
				print("Relinking: ", newp, row.fspath)
				row.fspath = to_locpath(new)
				pending += 1
			elif os.path.exists(old):
				destdir = os.path.split(new)[0]
				if not os.path.exists(destdir):
					os.makedirs(destdir)
				shutil.move(old, new)
				row.fspath = to_locpath(new)
				pending += 1
				print("Moving: ", old, new)
			else:
				row.state = "new"
				pending += 1

		if pending > 25000:
			print("Committing!")
			pending = 0
			sess.commit()

	sess.commit()
def do_task(self):
	'''
	Process a single task with a fresh SiteArchiver, always releasing the
	DB session state afterwards.

	Returns whatever taskProcess() reports (whether a job was handled),
	or False if construction/processing failed before that point.
	'''
	db_handle = db.get_db_session()
	hadjob = False
	try:
		self.archiver = WebMirror.Engine.SiteArchiver(
				self.cookie_lock,
				new_job_queue  = self.new_job_queue,
				response_queue = self.resp_queue,
				db_interface   = db_handle,
			)
		hadjob = self.archiver.taskProcess()
	finally:
		# Clear out the sqlalchemy state
		db_handle.expunge_all()
		db.delete_db_session()
	return hadjob
def exposed_reset_raw_missing():
	'''
	Retrigger all raw-archive links that don't seem to have a corresponding
	file on-disk.
	'''
	sess = db.get_db_session()
	dirty = 0
	# NOTE(review): .all() materializes the entire result set up front
	# despite yield_per -- presumably deliberate so rows can be mutated and
	# committed safely mid-iteration; confirm before changing.
	all_rows = sess.query(db.RawWebPages).yield_per(1000).all()
	for row in all_rows:
		if row.fspath:
			head, tail = row.fspath.split("/", 1)
			parts = head.split(".")
			parts.reverse()
			candidate = "/".join(parts) + "/" + tail

			legacy_path = os.path.join(C_RAW_RESOURCE_DIR, "old", row.fspath)
			target_path = os.path.join(C_RAW_RESOURCE_DIR, candidate)

			if os.path.exists(target_path) and row.fspath == candidate:
				# Path already in the new layout; nothing to do.
				pass
			elif os.path.exists(target_path):
				print("Relinking: ", candidate, row.fspath)
				row.fspath = to_locpath(target_path)
				dirty += 1
			elif os.path.exists(legacy_path):
				parent = os.path.split(target_path)[0]
				if not os.path.exists(parent):
					os.makedirs(parent)
				shutil.move(legacy_path, target_path)
				row.fspath = to_locpath(target_path)
				dirty += 1
				print("Moving: ", legacy_path, target_path)
			else:
				row.state = "new"
				dirty += 1
		else:
			row.state = "new"
			dirty += 1

		if dirty > 25000:
			print("Committing!")
			dirty = 0
			sess.commit()

	sess.commit()
def sync_raw_with_filesystem():
	'''
	Reconcile raw-archive rows with the files actually present under
	RAW_RESOURCE_DIR: unknown on-disk files are removed, and rows pointing
	at vanished files are reset so the content is re-fetched.
	'''
	sess = db.get_db_session()
	print("Loading files from database...")
	spinner1 = Spinner()
	db_paths = []
	for row in sess.query(db.RawWebPages).yield_per(1000):
		if row.fspath:
			db_paths.append(row.fspath)
			spinner1.next(vlen=len(row.fspath))
		else:
			spinner1.next(star=True)

	db_paths = set(db_paths)
	tgtpath = settings.RAW_RESOURCE_DIR
	print("")
	print("Enumerating files from disk...")

	orphaned = []
	present  = []
	spinner2 = Spinner()
	for root, dirs, files in os.walk(tgtpath):
		for filen in files:
			fqpath = os.path.join(root, filen)
			relpath = fqpath[len(tgtpath)+1:]
			if relpath in db_paths:
				spinner2.next(star=True, vlen=0)
				present.append(relpath)
			else:
				spinner2.next(vlen=1)
				orphaned.append(relpath)
				fqpath = os.path.join(tgtpath, relpath)
				os.unlink(fqpath)
				print("\rDeleting: %s " % fqpath)

	print()
	print("Found %s files (%s unique)" % (len(orphaned), len(set(orphaned))))

	vanished = set(db_paths) - set(present)

	for filen in orphaned:
		print("Should delete: '%s'" % filen)
	for filen in vanished:
		print("Missing: '%s'" % filen)
		sess.query(db.RawWebPages).filter(db.RawWebPages.fspath == filen).update({"state" : "new", "fspath" : None})
	sess.commit()
def reset_homepages():
	'''
	Mark the first 1000 novelupdates listing pages for re-fetch by
	resetting their state and epoch.
	'''
	import tqdm
	import common.database as db
	sess = db.get_db_session()
	for pageno in tqdm.trange(1, 1001):
		url = "https://www.novelupdates.com/?pg=%d" % pageno
		row = sess.query(db.WebPages)               \
			.filter(db.WebPages.url == url)         \
			.scalar()
		if row:
			row.state = 'new'
			row.epoch = 0
			sess.commit()
def resetInProgress():
	'''
	Flip any rows stuck in a transient fetch/processing state back to
	"new", so work interrupted by a previous shutdown is retried.
	'''
	print("Resetting any stalled downloads from the previous session.")
	sess = db.get_db_session()
	sess.query(db.WebPages)                                     \
		.filter(
				(db.WebPages.state == "fetching")
			|	(db.WebPages.state == "processing")
			|	(db.WebPages.state == "specialty_deferred")
			|	(db.WebPages.state == "specialty_ready")
			)                                                   \
		.update({db.WebPages.state : "new"})
	sess.commit()
	sess.close()
	db.delete_db_session()
def exposed_delete_gravitytales_bot_blocked_pages():
	'''
	Delete the "checking you're not a bot" garbage pages that sometimes
	get through the gravitytales scraper.
	'''
	sess = db.get_db_session()
	# Apply to both the live table and the history (version) table.
	for ctbl in [db.WebPages.__table__, version_table(db.WebPages)]:
		stmt = ctbl.delete()                                \
			.where(ctbl.c.netloc == "gravitytales.com")     \
			.where(ctbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
		print(stmt)
		sess.execute(stmt)
		sess.commit()
def exposed_import_feed_parse_funcs():
	'''
	Import the feed parsing functions into the database.
	'''
	sess = db.get_db_session()
	name_map = WebMirror.OutputFilters.util.feedNameLut.mapper
	for feed_key, nice_name in name_map.items():
		add_name(sess, feed_key, nice_name)
def get_times(self):
	'''
	Read the pickled apscheduler job states from the database and report
	each job's next scheduled run time.

	Returns a packed "system-update-times" message.
	'''
	conn = database.get_db_session()
	aps = conn.execute("SELECT job_state FROM apscheduler_jobs;")
	update_times = []
	for blob, in aps:
		job_dict = pickle.loads(blob)
		# ROBUSTNESS FIX: paused apscheduler jobs store next_run_time = None,
		# which previously crashed on None.isoformat(). Skip those entries.
		if job_dict.get('next_run_time') is None:
			continue
		update_times.append(
			(job_dict['id'], job_dict['next_run_time'].isoformat()))
	data = {
		"update-times": update_times,
	}
	database.delete_db_session()
	return pack_message("system-update-times", data)
def exposed_raw_test_retrieve(url):
	'''
	Lower level fetch test, otherwise similar to `test_retreive`
	'''
	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	sess = db.get_db_session()
	row = sess.query(db.RawWebPages).filter(db.RawWebPages.url == url).scalar()
	if row:
		# Existing row: just flag it for re-fetch.
		row.state = 'new'
	else:
		row = db.RawWebPages(
				url       = url,
				starturl  = root,
				netloc    = parsed.netloc,
				distance  = 50000,
				priority  = 500000,
				state     = 'new',
				fetchtime = datetime.datetime.now(),
			)
		sess.add(row)

	try:
		archiver = RawArchiver.RawEngine.RawSiteArchiver(
				total_worker_count = 1,
				worker_num         = 0,
				new_job_queue      = None,
				cookie_lock        = None,
				db_interface       = sess,
				response_queue     = None,
			)
		job = archiver.do_job(row)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()
def clean_files(self):
	'''
	Scan every WebFiles row that has a local filesystem path, and log an
	error for any whose backing file is missing from RESOURCE_DIR.
	'''
	session = db.get_db_session()
	q = session.query(db.WebFiles).filter(db.WebFiles.fspath != None)
	self.log.info("Querying for non-null filepaths...")
	have = q.all()
	self.log.info("Have %s local files.", len(have))

	count = 0
	for file in have:
		fpath = os.path.join(settings.RESOURCE_DIR, file.fspath)
		if not os.path.exists(fpath):
			self.log.error("Missing file: %s", fpath)
		# BUG FIX: the counter previously incremented only for missing files,
		# so the "Scanned" progress message undercounted. Count every row.
		count += 1
		if count % 1000 == 0:
			self.log.info("Scanned %s files.", count)
def launch_agg(cls, agg_queue):
	'''
	Entry point for the aggregator process/thread: build an instance wired
	to `agg_queue` with its own DB session and run it to completion,
	reporting any crash loudly on stdout.
	'''
	try:
		common.stuck.install_pystuck()
		agg_db = db.get_db_session()
		worker = cls(agg_queue, agg_db)
		worker.run()
		worker.close()
	except Exception as e:
		import traceback
		# Visually separate the crash report from surrounding output.
		for _ in range(6):
			print()
		print("Aggregator exception!")
		traceback.print_exc()
def exposed_print_scheduled_jobs():
	'''
	Print the apscheduler jobs table, unpickling and pretty-printing each
	job's stored state.
	'''
	sess = db.get_db_session()
	rows = list(sess.execute("""
		SELECT
			id, next_run_time , job_state
		FROM
			apscheduler_jobs
		"""))

	for tid, nextcall, content in rows:
		print("Job: ", tid.ljust(30), str(nextcall).rjust(20))
		pprint.pprint(pickle.loads(content))
def get_times(self):
	'''
	Build a "system-update-times" message from the pickled apscheduler job
	states stored in the database.
	'''
	conn = database.get_db_session()
	job_rows = conn.execute("SELECT job_state FROM apscheduler_jobs;")
	update_times = []
	for (blob, ) in job_rows:
		state = pickle.loads(blob)
		update_times.append((
				state['id'],
				state['next_run_time'].isoformat()
			))
	database.delete_db_session()
	return pack_message("system-update-times", {"update-times" : update_times})
def exposed_drop_priorities():
	'''
	Reset the priority of every row in the table to the IDLE_PRIORITY level
	'''
	step = 10000
	sess = db.get_db_session()

	print("Getting minimum row in need or update..")
	start = sess.execute("""SELECT min(id) FROM web_pages WHERE priority < 500000""")
	start = list(start)[0][0]

	print("Minimum row ID: ", start, "getting maximum row...")
	stop = sess.execute("""SELECT max(id) FROM web_pages WHERE priority < 500000""")
	stop = list(stop)[0][0]
	print("Maximum row ID: ", stop)

	if not start:
		print("No null rows to fix!")
		return

	print("Need to fix rows from %s to %s" % (start, stop))
	start = start - (start % step)

	dirty = 0
	for idx in range(start, stop, step):
		# SQL String munging! I'm a bad person!
		# Only done because I can't easily find how to make sqlalchemy
		# bind parameters ignore the postgres specific cast
		# The id range forces the query planner to use a much smarter approach
		# which is much more performant for small numbers of updates
		result = sess.execute("""update web_pages set priority = 500000 where priority < 500000 AND id > {} AND id <= {};""".format(idx, idx+step))

		processed  = idx - start
		total_todo = stop - start
		print('%10i, %10i, %7.4f, %6i' % (idx, stop, processed/total_todo * 100, result.rowcount))
		dirty += result.rowcount
		if dirty > 100000:
			print("Committing (%s changed rows)...." % dirty, end=' ')
			sess.commit()
			print("done")
			dirty = 0
	sess.commit()
def exposed_purge_raw_invalid_urls():
	'''
	Delete all raw-archiver rows that aren't attached to a archiver module.
	'''
	sess = db.get_db_session()
	uncommitted = 0
	for row in sess.query(db.RawWebPages).yield_per(1000).all():
		wanted = any([mod.cares_about_url(row.url) for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES])
		if not wanted:
			print("Unwanted: ", row.url)
			sess.delete(row)
			uncommitted += 1
			# Commit in batches so the transaction stays a reasonable size.
			if uncommitted > 5000:
				print("Committing!")
				uncommitted = 0
				sess.commit()
	sess.commit()
def exposed_underp_rss_functions():
	'''
	Do stupid fixes to the RSS database.

	Rewrites stored RssFeedEntry function sources, replacing a
	`return False` with `return None` in the specific snippet below.
	'''
	# NOTE(review): the exact internal whitespace of these literals is
	# behavior-critical -- `bad in row.func` only matches if it agrees
	# byte-for-byte with the stored function source. Verify against the
	# RssFeedEntry rows before modifying.
	bad = '''
		if not (chp or vol) or 'preview' in item['title'].lower():
			return False'''

	good = '''
		if not (chp or vol) or 'preview' in item['title'].lower():
			return None'''

	sess = db.get_db_session()
	rows = sess.query(db.RssFeedEntry).all()
	for row in rows:
		if bad in row.func:
			row.func = row.func.replace(bad, good)
			print(row)
			print(row.func)
	sess.commit()
	pass
def exposed_delete_old_nu_root_outbound():
	'''
	Delete NU outbound links that use the homepage as their referrer.
	Apparently NU was validating the referrer to see if the referring page
	actually had the referring link on it, or /something/. Anyways, it's
	easier to generate a permanent referrer by just pointing it at the
	series page.
	'''
	sess = db.get_db_session()
	non_series = sess.query(db.NuReleaseItem)                                               \
		.filter(not_(db.NuReleaseItem.referrer.like("%novelupdates.com/series%")))          \
		.yield_per(50)                                                                      \
		.all()
	for row in non_series:
		# Only remove rows that have no resolutions attached.
		if not len(list(row.resolved)):
			print(row.id, row.referrer)
			sess.delete(row)
	sess.commit()
def sync_filtered_with_filesystem():
	'''
	Compare WebFiles rows against the filtered resource directory on disk,
	reporting files that exist on disk but not in the database.
	(Actual deletion is currently commented out.)
	'''
	tgtpath = settings.RESOURCE_DIR
	sess = db.get_db_session()
	print("Loading files from database...")
	spinner1 = Spinner()
	in_db = []
	chunk_cnt = 0
	for row in sess.query(db.WebFiles).yield_per(10000):
		chunk_cnt += 1
		if row.fspath:
			in_db.append(row.fspath)
			spinner1.next(vlen=len(row.fspath), output=(chunk_cnt == 10))
		if chunk_cnt == 40:
			chunk_cnt = 0

	origl = len(in_db)
	in_db = set(in_db)
	print("")
	print("%s files, %s unique" % (origl, len(in_db)))
	print("Enumerating files from disk...")

	agg_files  = []
	have_files = []
	spinner2 = Spinner()
	for root, dirs, files in os.walk(tgtpath):
		for filen in files:
			fqpath = os.path.join(root, filen)
			fpath = fqpath[len(tgtpath) + 1:]
			if fpath in in_db:
				spinner2.next(star=True, vlen=0)
				have_files.append(fpath)
			else:
				spinner2.next(vlen=1)
				agg_files.append(fpath)
				fqpath = os.path.join(tgtpath, fpath)
				# os.unlink(fqpath)
				print("\rDeleting: %s " % fqpath)
def sync_filtered_with_filesystem():
	'''
	Walk the filtered resource directory and flag files that are present
	on disk but unknown to the WebFiles table. The unlink itself is
	disabled; this currently only reports what would be removed.
	'''
	tgtpath = settings.RESOURCE_DIR
	sess = db.get_db_session()
	print("Loading files from database...")
	spinner1 = Spinner()
	db_paths = []
	chunk_cnt = 0
	for row in sess.query(db.WebFiles).yield_per(10000):
		chunk_cnt += 1
		if row.fspath:
			db_paths.append(row.fspath)
			spinner1.next(vlen=len(row.fspath), output=(chunk_cnt == 10))
		if chunk_cnt == 40:
			chunk_cnt = 0

	total_paths = len(db_paths)
	db_paths = set(db_paths)
	print("")
	print("%s files, %s unique" % (total_paths, len(db_paths)))
	print("Enumerating files from disk...")

	orphaned = []
	present  = []
	spinner2 = Spinner()
	for root, dirs, files in os.walk(tgtpath):
		for filen in files:
			fqpath = os.path.join(root, filen)
			relpath = fqpath[len(tgtpath) + 1:]
			if relpath in db_paths:
				spinner2.next(star=True, vlen=0)
				present.append(relpath)
			else:
				spinner2.next(vlen=1)
				orphaned.append(relpath)
				fqpath = os.path.join(tgtpath, relpath)
				# os.unlink(fqpath)
				print("\rDeleting: %s " % fqpath)
def exposed_purge_raw_invalid_urls():
	'''
	Delete all raw-archiver rows that aren't attached to a archiver module.
	'''
	sess = db.get_db_session()
	deletions = 0
	rows = sess.query(db.RawWebPages).yield_per(1000).all()
	for row in rows:
		claimed = any([
			mod.cares_about_url(row.url)
			for mod in RawArchiver.RawActiveModules.ACTIVE_MODULES
		])
		if claimed:
			continue
		print("Unwanted: ", row.url)
		sess.delete(row)
		deletions += 1
		# Flush the transaction periodically so it doesn't balloon.
		if deletions > 5000:
			print("Committing!")
			deletions = 0
			sess.commit()
	sess.commit()
def dump_scheduled_jobs(sched):
	'''
	Print scheduler state: pending apscheduler jobs (with time until each
	runs), DB-flagged running plugins, and currently live threads.
	'''
	print("Scheduled jobs:")
	existing = sched.get_jobs()
	if not existing:
		print(" No jobs in scheduler!")
	tznow = datetime.datetime.now(tz=pytz.utc)
	for job in existing:
		print(" ", job, job.args, "running in:", job.next_run_time - tznow, (job.id, ))

	session = db.get_db_session()
	running = session.query(db.PluginStatus).filter(db.PluginStatus.is_running == True).all()
	print("Running jobs:")
	for jitem in running:
		print(" ", jitem.plugin_name, jitem.is_running, jitem.last_run, jitem.last_error, jitem.last_error_msg)
	if not running:
		print(" <None!>")

	print("Running threads:")
	for thread in threading.enumerate():
		# MODERNIZATION: thread.getName() is a deprecated camelCase alias;
		# the .name attribute is the supported spelling.
		print(" ", thread.name, thread)
	db.delete_db_session()
def exposed_fetch(url, debug=True, rss_debug=False):
	'''
	Do a synchronous fetch of content from url `url`.
	'''
	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	# Built for the debug dump below; the fetch itself goes through
	# synchronousJobRequest().
	new = db.WebPages(
			url       = url,
			starturl  = root,
			netloc    = parsed.netloc,
			distance  = 50000,
			is_text   = True,
			priority  = 500000,
			type      = 'unknown',
			fetchtime = datetime.datetime.now(),
		)
	if debug:
		print(new)

	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()
def do_db_sync():
	'''
	Sync the RssFeedEntry function table from the database into
	function_database.json, rewriting the file only when contents differ.
	'''
	sess = db.get_db_session()
	res = sess.query(db.RssFeedEntry) \
		.all()
	have_funcs = {row.feed_name: (row.func, row.last_changed) for row in res}
	sess.commit()

	def datetime_handler(x):
		if isinstance(x, datetime.datetime):
			return x.isoformat()
		raise TypeError("Unknown type")

	# BUG FIX: round-trip the in-memory structure through JSON so it is
	# directly comparable with what json.loads() gives back from the file
	# (tuples become lists, datetimes become ISO strings). The original
	# compared tuples/datetimes against lists/strings, so the equality test
	# could never succeed and the file was rewritten on every call.
	have_funcs = json.loads(json.dumps(have_funcs, default=datetime_handler))

	this_dir = os.path.dirname(__file__)
	func_json_path = os.path.join(this_dir, "function_database.json")

	file_funcs = {}
	try:
		if os.path.exists(func_json_path):
			with open(func_json_path, "r") as fp:
				data = fp.read()
			if data:
				file_funcs = json.loads(data)
	except json.JSONDecodeError:
		pass

	if have_funcs == file_funcs:
		print("Function storage file is up-to-date. Nothing to do!")
		return

	print("Updating function database file.")
	with open(func_json_path, "w") as fp:
		json.dump(have_funcs, fp, indent=True, sort_keys=True, default=datetime_handler)
def fix_missing_history(self):
	'''
	Find web_pages rows that have no corresponding web_pages_version
	history rows, and re-touch them in batches of 50 so history entries
	get created.
	'''
	sess = db.get_db_session()
	self.qlog.info("Querying for items without any history")
	res = sess.execute("""
			SELECT t1.url
			FROM web_pages t1
			LEFT JOIN web_pages_version t2 ON t2.url = t1.url
			WHERE t2.url IS NULL
		""")
	missing = [tmp[0] for tmp in res]
	self.log.info("Found %s rows missing history content!", len(missing))

	remaining = len(missing)
	for urlset in batch(missing, 50):
		remaining = remaining - len(urlset)
		self.tickle_rows(sess, urlset)
		self.log.info("Processed %s of %s (%s%%)",
				len(missing)-remaining,
				len(missing),
				100-((remaining/len(missing)) * 100),
			)
def exposed_nu_new():
    '''
    Parse outbound netlocs from NovelUpdates releases, extracting any
    sites that are not known in the feednamelut.
    '''
    import WebMirror.OutputFilters.util.feedNameLut as fnl
    sess = db.get_db_session()

    # NOTE: `== True` / `!= None` are SQLAlchemy column expressions
    # (rendered as IS TRUE / IS NOT NULL in SQL), not Python comparisons.
    nu_rows = sess.query(db.NuOutboundWrapperMap) \
        .filter(db.NuOutboundWrapperMap.validated == True) \
        .filter(db.NuOutboundWrapperMap.actual_target != None) \
        .all()

    hosts = [urllib.parse.urlsplit(row.actual_target).netloc for row in nu_rows]
    print("Nu outbound items: ", len(hosts))

    unique_hosts = set(hosts)
    for host in unique_hosts:
        if not fnl.getNiceName(None, host):
            # Re-run the lookup with debugging enabled so the miss is logged.
            fnl.getNiceName(None, host, debug=True)
            print("Missing: ", host)
    print("Nu outbound items: ", len(unique_hosts))
def __init__(self, job_name):
    '''
    Resolve `job_name` to its callable via CALLABLE_LUT and make sure a
    PluginStatus row exists for it in the database.

    Raises JobNameException when the job name is not in the lookup table.
    '''
    if job_name not in CALLABLE_LUT:
        raise JobNameException("Callable '%s' is not in the class lookup table: '%s'!" % (job_name, CALLABLE_LUT))
    self.runModule = CALLABLE_LUT[job_name]
    self.job_name = job_name

    session = db.get_db_session()
    try:
        status_query = session.query(db.PluginStatus) \
            .filter(db.PluginStatus.plugin_name == job_name)
        if not status_query.scalar():
            # First time this plugin has run: create its status row.
            session.add(db.PluginStatus(plugin_name=job_name))
            session.commit()
    except (sqlalchemy.exc.OperationalError, sqlalchemy.exc.InvalidRequestError):
        session.rollback()
    finally:
        db.delete_db_session()
def exposed_fix_null():
    '''
    Reset any rows in the table where the `ignoreuntiltime` column is null.

    Updates in 50K row increments, committing every ~10K changed rows.
    '''
    step = 50000

    end = db.get_db_session().execute("""SELECT MAX(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
    end = list(end)[0][0]

    start = db.get_db_session().execute("""SELECT MIN(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
    start = list(start)[0][0]

    changed = 0
    if not start:
        print("No null rows to fix!")
        return

    # Round down to a step boundary so the id windows line up.
    start = start - (start % step)

    # BUG FIX: the loop previously ran `range(start, end, step)` with each
    # iteration updating ids in [x - step, x). The first window fell
    # entirely below the rounded-down minimum id (updating nothing), and
    # rows in the final partial window — up to and including `end`, the
    # MAX id — were never touched. Shifting the range by one step makes
    # the windows cover [start, end] completely.
    for x in range(start + step, end + step, step):
        # SQL String munging! I'm a bad person!
        # Only done because I can't easily find how to make sqlalchemy
        # bind parameters ignore the postgres specific cast.
        # The id range forces the query planner to use a much smarter
        # approach, which is much more performant for small numbers of updates.
        have = db.get_db_session().execute("""UPDATE web_pages SET ignoreuntiltime = 'epoch'::timestamp WHERE ignoreuntiltime IS NULL AND id < %s AND id >= %s;""" % (x, x - step))

        print('%10i, %7.4f, %6i' % (x, x / end * 100, have.rowcount))
        changed += have.rowcount
        if changed > 10000:
            print("Committing (%s changed rows)...." % changed, end=' ')
            db.get_db_session().commit()
            print("done")
            changed = 0

    db.get_db_session().commit()
def before_request():
    '''
    Flask per-request hook: check a database session out onto the
    request-globals object and set the request locale.
    '''
    # Explicitly avoid the flask-managed session here.
    g.session = database.get_db_session(flask_sess_if_possible=False)
    g.locale = 'en'
    print("Checked out session")
def exposed_rss_db_sync(target = None, days=False, silent=False):
    '''
    Feed RSS feed history through the feedparsing system, generating a
    log file of the feed articles that were not captured by the feed
    parsing system.

    Target is an optional netloc. If not none, only feeds with that
    netloc are processed.

    Days is the number of days into the past to process. None results
    in all available history being read.

    Silent suppresses some debug printing to the console.
    '''
    json_file = 'rss_filter_misses-1.json'

    write_debug = True
    if silent:
        config.C_DO_RABBIT = False

    if target:
        config.C_DO_RABBIT = False
        flags.RSS_DEBUG = True
        write_debug = False
    else:
        # Full pass: start the miss-log file fresh.
        try:
            os.unlink(json_file)
        except FileNotFoundError:
            pass

    import WebMirror.processor.RssProcessor
    parser = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath   = "Main.RssDb",
            pageUrl      = 'http://www.example.org',
            pgContent    = '',
            type         = 'application/atom+xml',
            transfer     = False,
            debug_print  = True,
            db_sess      = None,
            write_debug  = write_debug)

    print("Getting feed items....")

    if target:
        print("Limiting to '%s' source." % target)
        feed_items = db.get_db_session().query(db.FeedItems) \
            .filter(db.FeedItems.srcname == target) \
            .order_by(db.FeedItems.srcname) \
            .order_by(db.FeedItems.title) \
            .all()
    elif days:
        print("RSS age override: ", days)
        cutoff = datetime.datetime.now() - datetime.timedelta(days=days)
        feed_items = db.get_db_session().query(db.FeedItems) \
            .filter(db.FeedItems.published > cutoff) \
            .order_by(db.FeedItems.srcname) \
            .order_by(db.FeedItems.title) \
            .all()
    else:
        feed_items = db.get_db_session().query(db.FeedItems) \
            .order_by(db.FeedItems.srcname) \
            .order_by(db.FeedItems.title) \
            .all()

    print("Feed items: ", len(feed_items))

    for item in feed_items:
        # Reconstruct the dict shape processFeedData() expects.
        ctnt = {}
        ctnt['srcname']   = item.srcname
        ctnt['title']     = item.title
        ctnt['tags']      = item.tags
        ctnt['linkUrl']   = item.contenturl
        ctnt['guid']      = item.contentid
        ctnt['published'] = calendar.timegm(item.published.timetuple())

        # Pop()ed off in processFeedData().
        ctnt['contents']  = 'wat'

        try:
            parser.processFeedData(ctnt, tx_raw=False, tx_parse=not bool(days))
        except ValueError:
            # Some historical entries fail to parse; skip them (best-effort).
            pass
        # print(ctnt)

    # IDIOM FIX: was `target == None`; identity comparison (`is None`) is
    # the correct test for the sentinel default and avoids __eq__ surprises.
    if target is None:
        exposed_sort_json(json_file)