def __load_referrer(self, base_url, item):
    if any([tmp.client_id == 'local' for tmp in item.resolved]):
        return

    # Don't dewaf
    old_autowaf = self.kwargs['wg_proxy']().rules['auto_waf']
    self.kwargs['wg_proxy']().rules['auto_waf'] = False
    content, name, mime, url = self.kwargs['wg_proxy']().getFileNameMimeUrl(
        item.outbound_wrapper, addlHeaders={'Referer': base_url})
    self.kwargs['wg_proxy']().rules['auto_waf'] = old_autowaf

    if url.startswith("https://www.novelupdates.com/extnu/"):
        raise RuntimeError("Failure when extracting NU referrer!")

    item.validated     = True
    item.validated_on  = datetime.datetime.now()
    item.actual_target = url
    item.reviewed      = "manual_validate"

    pg_title = title_from_html(content)

    new = db.NuResolvedOutbound(
        client_id="local",
        client_key="local",
        actual_target=url,
        resolved_title=pg_title,
        fetched_on=datetime.datetime.now(),
    )
    item.resolved.append(new)
    self.db_sess.commit()

    self.log.info("TL Group: %s. Series %s, chap: %s", item.groupinfo, item.seriesname, item.releaseinfo)
    self.log.info("URL '%s' resolved to '%s'", item.outbound_wrapper, item.actual_target)
    self.log.info("Page title: '%s'", pg_title)

    # Sleep between 3 and 30 seconds, most often around 10 seconds.
    # random.triangular() takes (low, high, mode), so the mode goes last.
    sleep = random.triangular(3, 30, 10)
    self.log.info("Sleeping %s", sleep)
    time.sleep(sleep)
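# ``title_from_html`` is called above but not defined in this section. A
# minimal sketch of what such a helper could look like, assuming
# BeautifulSoup (bs4) is available; the real implementation may differ.
def title_from_html(content):
    import bs4  # local import so the sketch is self-contained
    soup = bs4.BeautifulSoup(content, 'html.parser')
    # Pull the <title> tag's text, if the page has one.
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    return None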
def cross_sync(increment):
    sess = db.get_db_session()
    print("Loading extant rows...")
    old_nu_items = sess.query(db.NuOutboundWrapperMap) \
        .order_by(desc(db.NuOutboundWrapperMap.id))    \
        .all()
    print("Loaded. Processing")

    have_count = 0
    new_count  = 0
    loops      = 0
    nc_loops   = 0
    try:
        for old_nu in old_nu_items:
            have = sess.query(db.NuReleaseItem)                                        \
                .options(joinedload('resolved'))                                       \
                .filter(db.NuReleaseItem.outbound_wrapper == old_nu.outbound_wrapper)  \
                .scalar()
            if not have:
                have = db.NuReleaseItem(
                    validated        = old_nu.validated,
                    seriesname       = old_nu.seriesname,
                    releaseinfo      = old_nu.releaseinfo,
                    groupinfo        = old_nu.groupinfo,
                    referrer         = old_nu.referrer,
                    outbound_wrapper = old_nu.outbound_wrapper,
                    first_seen       = old_nu.released_on,
                    actual_target    = old_nu.actual_target,
                )
                sess.add(have)
                loops     += 1
                new_count += 1

            old_key  = (old_nu.client_id, old_nu.client_key, old_nu.actual_target)
            resolved = set([(itm.client_id, itm.client_key, itm.actual_target) for itm in have.resolved])

            if old_key not in resolved:
                new = db.NuResolvedOutbound(
                    client_id     = old_nu.client_id,
                    client_key    = old_nu.client_key,
                    actual_target = old_nu.actual_target,
                    fetched_on    = old_nu.released_on,
                )
                have.resolved.append(new)
                loops     += 1
                new_count += 1
            else:
                have_count += 1
                nc_loops   += 1

            if loops > increment:
                print("Commit! Have {}, new {} ({}, {})".format(have_count, new_count, loops, nc_loops))
                sess.commit()
                loops = 0
            if nc_loops > 100:
                print("Have {}, new {} ({}, {})".format(have_count, new_count, loops, nc_loops))
                nc_loops = 0
                sess.commit()
    except Exception:
        sess.rollback()
        raise
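# A minimal invocation sketch for the one-shot migration above. The batch
# size of 500 is illustrative, not a value taken from the original code.
if __name__ == '__main__':
    cross_sync(500)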
def process_single_source_target_mapping(
        self, from_outbound_wrapper, from_outbound_referrer, to_url, to_url_title, client_id, client_key):
    if to_url.endswith("?m=1"):
        to_url = to_url[:-len("?m=1")]
    if "?utm_source=" in to_url:
        to_url = to_url.split("?utm_source=")[0]

    self.log.info("Processing remote head response: %s", (from_outbound_wrapper, from_outbound_referrer))
    self.log.info("Resolved job to URL: %s", to_url)
    self.log.info("Page title: %s", to_url_title)

    # Handle the 301/2 not resolving properly.
    netloc = urllib.parse.urlsplit(to_url).netloc
    if "novelupdates" in netloc:
        self.mon_con.incr('head-failed', 1)
        self.log.warning("Failed to validate external URL. Either scraper is blocked, or phantomjs is failing.")
        return True

    # Per-site URL fix-ups for hosts that redirect to mobile/legacy domains.
    if '/m.wuxiaworld.com' in to_url:
        to_url = to_url.replace('/m.wuxiaworld.com', '/www.wuxiaworld.com')
    # So dumb
    if '/foruwww.wuxiaworld.com' in to_url:
        to_url = to_url.replace('/foruwww.wuxiaworld.com', '/forum.wuxiaworld.com')
    if 'tseirptranslations.blogspot.com' in to_url:
        to_url = to_url.replace('tseirptranslations.blogspot.com', 'tseirptranslations.com')
    if 'babelnovel.com/rssbooks/' in to_url:
        to_url = to_url.replace('babelnovel.com/rssbooks/', 'babelnovel.com/books/')
    if 'm.xianxiaworld.net' in to_url:
        to_url = to_url.replace('m.xianxiaworld.net', 'www.xianxiaworld.net')
    if 'shikkakutranslations.wordpress.com' in to_url:
        to_url = to_url.replace('shikkakutranslations.wordpress.com', 'shikkakutranslations.com')

    if any([tmp in to_url for tmp in BAD_RESOLVES]):
        self.log.warning("Bad resolve in url: '%s'. Not inserting into DB.", to_url)
        self.mon_con.incr('head-failed', 1)
        return True

    if not to_url.lower().startswith("http"):
        self.log.warning("URL '%s' does not start with 'http'. Not inserting into DB.", to_url)
        self.mon_con.incr('head-failed', 1)
        return True

    to_url_title = self.check_resolve_locally(to_url, to_url_title)

    if '/?utm_source=feedburner' in to_url:
        to_url = to_url.split('/?utm_source=feedburner')[0] + "/"

    while True:
        with db.session_context() as db_sess:
            try:
                self.log.info("Trying for upsert")
                have = db_sess.query(db.NuReleaseItem)                                   \
                    .options(joinedload('resolved'))                                     \
                    .filter(db.NuReleaseItem.outbound_wrapper == from_outbound_wrapper)  \
                    .filter(db.NuReleaseItem.referrer == from_outbound_referrer)         \
                    .scalar()

                if not have:
                    self.log.error("Base row deleted from resolve?")
                    self.mon_con.incr('head-failed', 1)
                    return

                if to_url_title is None:
                    self.log.warning("Item didn't resolve to a name properly!")
                    self.mon_con.incr('head-failed', 1)
                    return

                if to_url_title.strip().lower() == to_url.strip().lower():
                    self.log.warning("Item didn't resolve to a name properly!")
                    self.mon_con.incr('head-failed', 1)
                    return

                new = db.NuResolvedOutbound(
                    client_id      = client_id,
                    client_key     = client_key,
                    actual_target  = to_url,
                    resolved_title = to_url_title,
                    fetched_on     = datetime.datetime.now(),
                )
                have.resolved.append(new)
                db_sess.commit()
                self.mon_con.incr('head-received', 1)
                return False

            except sqlalchemy.exc.InvalidRequestError as e:
                self.log.error("Exception: %s!", e)
                db_sess.rollback()
            except sqlalchemy.exc.OperationalError as e:
                self.log.error("Exception: %s!", e)
                db_sess.rollback()
            except sqlalchemy.exc.IntegrityError as e:
                self.log.error("Exception: %s!", e)
                db_sess.rollback()
                self.mon_con.incr('head-failed', 1)
                return False
            except Exception:
                self.log.error("Error when processing job response!")
                for line in traceback.format_exc().split("\n"):
                    self.log.error(line)
                self.log.error("Contents of head response:")
                for line in pprint.pformat(new).split("\n"):
                    self.log.error(line)
                self.mon_con.incr('head-failed', 1)
                return True

    # Unreachable: the retry loop above always returns or loops.
    self.mon_con.incr('head-failed', 1)
    return False
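# The per-site URL rewrites above could equally be table-driven. A sketch,
# using only the substitution pairs that already appear in
# process_single_source_target_mapping; the helper name is hypothetical:
URL_FIXUPS = [
    ('/m.wuxiaworld.com',                  '/www.wuxiaworld.com'),
    ('/foruwww.wuxiaworld.com',            '/forum.wuxiaworld.com'),
    ('tseirptranslations.blogspot.com',    'tseirptranslations.com'),
    ('babelnovel.com/rssbooks/',           'babelnovel.com/books/'),
    ('m.xianxiaworld.net',                 'www.xianxiaworld.net'),
    ('shikkakutranslations.wordpress.com', 'shikkakutranslations.com'),
]

def apply_url_fixups(url):
    # Apply each broken-host -> canonical-host substitution in turn.
    for bad, good in URL_FIXUPS:
        if bad in url:
            url = url.replace(bad, good)
    return url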
def process_single_avail(self):
    '''
    Example response:
    {
        'call': 'getHeadPhantomJS',
        'cancontinue': True,
        'dispatch_key': 'fetcher',
        'extradat': {'mode': 'fetch'},
        'jobid': -1,
        'jobmeta': {'sort_key': 'a269f164a16e11e6891500163ef6fe07'},
        'module': 'NUWebRequest',
        'ret': 'http://lightnovels.world/the-nine-godheads/nine-godheads-chapter-74/',
        'success': True,
        'user': '******',
        'user_uuid': 'urn:uuid:0a243518-834f-46d8-b34c-7f2afd20d37f'
    }
    '''
    self.check_open_rpc_interface()

    while 1:
        try:
            new = self.rpc.get_job()
            break
        except TypeError:
            self.check_open_rpc_interface()
        except KeyError:
            self.check_open_rpc_interface()

    if new is None:
        self.log.info("No NU Head responses!")
        return False

    expected_keys = ['call', 'cancontinue', 'dispatch_key', 'extradat', 'jobid',
                     'jobmeta', 'module', 'ret', 'success', 'user', 'user_uuid']

    while True:
        with db.session_context() as db_sess:
            try:
                assert all([key in new for key in expected_keys])
                assert 'referrer' in new['extradat']
                assert 'wrapper_url' in new['extradat']

                if new['call'] == 'getHeadPhantomJS':
                    respurl, title = new['ret'], ""
                elif new['call'] in ('getHeadTitlePhantomJS', 'getHeadTitleChromium'):
                    if isinstance(new['ret'], (tuple, list)):
                        respurl, title = new['ret']
                    elif isinstance(new['ret'], dict):
                        respurl = new['ret']['url']
                        title = new['ret']['title']
                    else:
                        raise RuntimeError("Don't know what the return type of `getHeadTitlePhantomJS` is! Type: %s" % type(new['ret']))
                else:
                    raise RuntimeError("Response to unknown call: %s!" % new)

                if respurl.endswith("?m=1"):
                    respurl = respurl[:-len("?m=1")]

                self.log.info("Processing remote head response: %s", new)
                self.log.info("Resolved job to URL: %s", respurl)
                self.log.info("Page title: %s", title)

                # Handle the 301/2 not resolving properly.
                netloc = urllib.parse.urlsplit(respurl).netloc
                if "novelupdates" in netloc:
                    self.log.warning("Failed to validate external URL. Either scraper is blocked, or phantomjs is failing.")
                    return True

                if 'm.wuxiaworld.com' in respurl:
                    respurl = respurl.replace('m.wuxiaworld.com', 'www.wuxiaworld.com')
                if 'tseirptranslations.blogspot.com' in respurl:
                    respurl = respurl.replace('tseirptranslations.blogspot.com', 'tseirptranslations.com')
                if 'm.xianxiaworld.net' in respurl:
                    respurl = respurl.replace('m.xianxiaworld.net', 'www.xianxiaworld.net')
                if 'shikkakutranslations.wordpress.com' in respurl:
                    respurl = respurl.replace('shikkakutranslations.wordpress.com', 'shikkakutranslations.com')

                if any([tmp in respurl for tmp in BAD_RESOLVES]):
                    self.log.warning("Bad resolve in url: '%s'. Not inserting into DB.", respurl)
                    return True

                if not respurl.lower().startswith("http"):
                    self.log.warning("URL '%s' does not start with 'http'. Not inserting into DB.", respurl)
                    return True

                if '/?utm_source=feedburner' in respurl:
                    respurl = respurl.split('/?utm_source=feedburner')[0] + "/"

                have = db_sess.query(db.NuReleaseItem)                                                \
                    .options(joinedload('resolved'))                                                  \
                    .filter(db.NuReleaseItem.outbound_wrapper == new['extradat']['wrapper_url'])      \
                    .filter(db.NuReleaseItem.referrer == new['extradat']['referrer'])                 \
                    .scalar()

                if not have:
                    self.log.error("Base row deleted from resolve?")
                    return

                if title.strip().lower() == respurl.strip().lower():
                    self.log.warning("Item didn't resolve to a name properly!")
                    return

                new = db.NuResolvedOutbound(
                    client_id      = new['user'],
                    client_key     = new['user_uuid'],
                    actual_target  = respurl,
                    resolved_title = title,
                    fetched_on     = datetime.datetime.now(),
                )
                have.resolved.append(new)
                db_sess.commit()
                self.mon_con.incr('head-received', 1)
                return True

            except sqlalchemy.exc.InvalidRequestError:
                db_sess.rollback()
            except sqlalchemy.exc.OperationalError:
                db_sess.rollback()
            except sqlalchemy.exc.IntegrityError:
                db_sess.rollback()
            except Exception:
                self.mon_con.incr('head-failed', 1)
                self.log.error("Error when processing job response!")
                for line in traceback.format_exc().split("\n"):
                    self.log.error(line)
                self.log.error("Contents of head response:")
                for line in pprint.pformat(new).split("\n"):
                    self.log.error(line)
                return True

    # Unreachable: the retry loop above always returns or loops.
    return False
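# Both resolver paths above share the same transactional shape: loop, commit
# on success, roll back and retry on transient SQLAlchemy errors, and bail
# out on anything else. A hypothetical generalisation of that pattern
# (``commit_with_retry`` is not part of the original code):
import sqlalchemy.exc

def commit_with_retry(db_sess, work, max_tries=5):
    for _ in range(max_tries):
        try:
            result = work(db_sess)  # `work` performs the queries/inserts
            db_sess.commit()
            return result
        except (sqlalchemy.exc.InvalidRequestError,
                sqlalchemy.exc.OperationalError,
                sqlalchemy.exc.IntegrityError):
            db_sess.rollback()      # transient failure; retry the whole unit
    raise RuntimeError("Transaction did not go through after %s tries" % max_tries)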
def process_single_avail(self):
    '''
    Example response:
    {
        'call': 'getHeadPhantomJS',
        'cancontinue': True,
        'dispatch_key': 'fetcher',
        'extradat': {'mode': 'fetch'},
        'jobid': -1,
        'jobmeta': {'sort_key': 'a269f164a16e11e6891500163ef6fe07'},
        'module': 'NUWebRequest',
        'ret': 'http://lightnovels.world/the-nine-godheads/nine-godheads-chapter-74/',
        'success': True,
        'user': '******',
        'user_uuid': 'urn:uuid:0a243518-834f-46d8-b34c-7f2afd20d37f'
    }
    '''
    self.check_open_rpc_interface()

    errors = 0
    while 1:
        try:
            new = self.rpc.get_job()
            break
        except TypeError:
            self.check_open_rpc_interface()
        except KeyError:
            self.check_open_rpc_interface()
        except bsonrpc.exceptions.BsonRpcError as e:
            errors += 1
            self.check_open_rpc_interface()
            if errors > 3:
                raise e
            else:
                self.log.warning("Exception in RPC request:")
                for line in traceback.format_exc().split("\n"):
                    self.log.warning(line)

    expected_keys = [
        'call', 'cancontinue', 'dispatch_key', 'extradat', 'jobid',
        'jobmeta', 'module', 'ret', 'success', 'user', 'user_uuid'
    ]

    if new is None:
        self.log.info("No NU Head responses!")
        return False

    while True:
        try:
            assert all([key in new for key in expected_keys])
            assert 'referrer' in new['extradat']
            assert 'wrapper_url' in new['extradat']

            if new['call'] == 'getHeadPhantomJS':
                respurl, title = new['ret'], ""
            elif new['call'] in ('getHeadTitlePhantomJS', 'getHeadTitleChromium'):
                if isinstance(new['ret'], (tuple, list)):
                    respurl, title = new['ret']
                elif isinstance(new['ret'], dict):
                    respurl = new['ret']['url']
                    title = new['ret']['title']
                else:
                    raise RuntimeError("Don't know what the return type of `getHeadTitlePhantomJS` is! Type: %s" % type(new['ret']))
            else:
                raise RuntimeError("Response to unknown call: %s!" % new)

            self.log.info("Processing remote head response: %s", new)
            self.log.info("Resolved job to URL: %s", respurl)
            self.log.info("Page title: %s", title)

            # Handle the 301/2 not resolving properly.
            netloc = urllib.parse.urlsplit(respurl).netloc
            if "novelupdates" in netloc:
                self.log.warning("Failed to validate external URL. Either scraper is blocked, or phantomjs is failing.")
                return True

            have = self.db_sess.query(db.NuReleaseItem)                                           \
                .options(joinedload('resolved'))                                                  \
                .filter(db.NuReleaseItem.outbound_wrapper == new['extradat']['wrapper_url'])      \
                .filter(db.NuReleaseItem.referrer == new['extradat']['referrer'])                 \
                .scalar()

            if not have:
                self.log.error("Base row deleted from resolve?")
                return

            new = db.NuResolvedOutbound(
                client_id      = new['user'],
                client_key     = new['user_uuid'],
                actual_target  = respurl,
                resolved_title = title,
                fetched_on     = datetime.datetime.now(),
            )
            have.resolved.append(new)
            self.db_sess.commit()
            return True

        except sqlalchemy.exc.InvalidRequestError:
            self.db_sess.rollback()
        except sqlalchemy.exc.OperationalError:
            self.db_sess.rollback()
        except sqlalchemy.exc.IntegrityError:
            self.db_sess.rollback()
        except Exception:
            self.log.error("Error when processing job response!")
            for line in traceback.format_exc().split("\n"):
                self.log.error(line)
            self.log.error("Contents of head response:")
            for line in pprint.pformat(new).split("\n"):
                self.log.error(line)
            return True

    # Unreachable: the retry loop above always returns or loops.
    return False
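# In both variants of process_single_avail, ``new['ret']`` arrives in one of
# three shapes depending on the remote call: a bare URL string, a
# (url, title) pair, or a {'url': ..., 'title': ...} dict. A hypothetical
# helper isolating that normalisation (not part of the original code):
def extract_url_title(call, ret):
    if call == 'getHeadPhantomJS':
        # Plain HEAD calls return just the resolved URL; no title available.
        return ret, ""
    if call in ('getHeadTitlePhantomJS', 'getHeadTitleChromium'):
        if isinstance(ret, (tuple, list)):
            url, title = ret
            return url, title
        if isinstance(ret, dict):
            return ret['url'], ret['title']
        raise RuntimeError("Unknown return type: %s" % type(ret))
    raise RuntimeError("Response to unknown call: %s!" % call)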