def put_outbound_callable(self, jobid, serialized, meta=None, call_kwargs=None, early_ack=False, job_unique_id=None):
	'''
	Dispatch a serialized callable as a RemoteExec 'callCode' job.

	jobid         -- job identifier attached to the outgoing job.
	serialized    -- serialized code structure, sent as 'code_struct'.
	meta          -- optional dict attached as additionalData (defaults to {}).
	call_kwargs   -- optional dict of extra kwargs merged into the call payload.
	early_ack     -- passed through to buildjob unchanged.
	job_unique_id -- optional unique id for the job.

	NOTE: the original signature used mutable default arguments
	(meta={}, call_kwargs={}), which are shared across all calls and a
	classic Python pitfall; replaced with None sentinels. Effective
	defaults are unchanged.
	'''
	self.log.info("Dispatching new callable job")

	# The serialized code payload always rides along; caller kwargs are
	# merged on top of it.
	call_kwargs_out = {'code_struct': serialized}
	if call_kwargs:
		call_kwargs_out.update(call_kwargs)

	raw_job = buildjob(
		module         = 'RemoteExec',
		call           = 'callCode',
		dispatchKey    = "rwp-rpc-system",
		jobid          = jobid,
		kwargs         = call_kwargs_out,
		additionalData = meta if meta is not None else {},
		postDelay      = 0,
		early_ack      = early_ack,
		serialize      = self.pluginName,
		unique_id      = job_unique_id,
	)
	self.put_outbound_raw(raw_job)
def exposed_head(url, ref):
	'''
	Do a phantomjs HEAD request for url `url`, passing the referrer `ref`
	'''
	interface = common.get_rpyc.RemoteJobInterface("Test_Interface!")
	print('wat?')
	print(interface)

	head_job = buildjob(
		module         = 'SmartWebRequest',
		call           = 'getHeadTitleChromium',
		dispatchKey    = "fetcher",
		jobid          = -1,
		args           = [url, ref],
		kwargs         = {},
		additionalData = {'mode': 'fetch'},
		postDelay      = 0,
		unique_id      = url,
	)
	interface.put_job(head_job)

	# Poll indefinitely, printing every response as it arrives.
	while True:
		try:
			response = interface.get_job()
			print_response(response)
			if not response:
				time.sleep(1)
		except queue.Empty:
			print("No response yet?")
def __blocking_dispatch_call_local(self, remote_cls, call_kwargs, meta=None, expect_partials=False):
	'''
	Serialize `remote_cls` and synchronously dispatch it to the local
	executor, blocking until the response has been processed.

	remote_cls      -- class to serialize (via rpc_serialize.serialize_class)
	                   and execute remotely.
	call_kwargs     -- dict of extra kwargs merged into the call payload.
	meta            -- optional additionalData attached to the job.
	expect_partials -- when True, return the response-item generator;
	                   otherwise return the single processed response.

	Fixes over the original: the RPC interface is now closed in a
	try/finally (it previously leaked if check_ok() or
	dispatch_request() raised), and dead commented-out code plus debug
	print() cruft have been removed.
	'''
	self.log.info("Dispatching new callable job to local executor")

	scls = rpc_serialize.serialize_class(remote_cls)
	call_kwargs_out = {'code_struct': scls}
	call_kwargs_out.update(call_kwargs)

	# Allocate a locally-unique job id.
	jid = self.job_counter
	self.job_counter += 1

	raw_job = buildjob(
		module         = 'RemoteExec',
		call           = 'callCode',
		dispatchKey    = "rwp-rpc-system",
		jobid          = jid,
		kwargs         = call_kwargs_out,
		additionalData = meta,
		postDelay      = 0,
		early_ack      = False,
		serialize      = self.pluginName,
		unique_id      = None,
	)

	rpc_interface = common.get_rpyc.RemoteFetchInterface()
	try:
		rpc_interface.check_ok()
		ret = rpc_interface.dispatch_request(raw_job)
	finally:
		# Always release the interface, even when the dispatch fails.
		rpc_interface.close()

	ret['jobid'] = jid
	ret = self.process_response_items([jid], expect_partials, preload_rets=[ret])
	if not expect_partials:
		ret = next(ret)
	return ret
def put_outbound_fetch_job(self, jobid, joburl):
	'''
	Queue a plain WebRequest 'getItem' fetch for `joburl` under `jobid`.
	'''
	self.log.info("Dispatching new fetch job")
	fetch_job = buildjob(
		module         = 'WebRequest',
		call           = 'getItem',
		dispatchKey    = "rwp-rpc-system",
		jobid          = jobid,
		args           = [joburl],
		kwargs         = {},
		additionalData = {'mode': 'fetch'},
		postDelay      = 0,
	)
	self.put_outbound_raw(fetch_job)
def exposed_test_chromium_fetch():
	'''
	Run a test-fetch with the chromium remote rendering system
	'''
	print("Chromium Test")
	interface = common.get_rpyc.RemoteJobInterface("TestInterface")
	interface.check_ok()
	print("RPC:", interface)
	print("Dispatching job engine")

	# Jobs 1 and 2 are built but never queued (kept for parity with the
	# original test harness); only jobs 3 and 4 are dispatched below.
	raw_job_1 = buildjob(
		module         = 'NUWebRequest',
		call           = 'getHeadTitlePhantomJS',
		dispatchKey    = "lolwattttt",
		jobid          = "lolwat",
		args           = ['http://www.google.com', 'http://www.goat.com'],
		kwargs         = {},
		additionalData = {'herp': 'derp'},
		postDelay      = 0,
	)
	raw_job_2 = buildjob(
		module         = 'WebRequest',
		call           = 'getHeadTitleChromium',
		dispatchKey    = "lolwattttt",
		jobid          = "lolwat",
		args           = [],
		kwargs         = {'url': 'http://www.google.com', 'referrer': 'http://www.goat.com'},
		additionalData = {'herp': 'derp'},
		postDelay      = 0,
	)
	raw_job_3 = buildjob(
		module         = 'WebRequest',
		call           = 'getItemChromium',
		dispatchKey    = "lolwattttt",
		jobid          = "lolwat",
		args           = [],
		kwargs         = {'itemUrl': 'http://www.google.com'},
		additionalData = {'herp': 'derp'},
		postDelay      = 0,
	)
	raw_job_4 = buildjob(
		module         = 'WebRequest',
		call           = 'getItem',
		dispatchKey    = "lolwattttt",
		jobid          = "lolwat",
		args           = [],
		kwargs         = {'itemUrl': 'http://imgsv.imaging.nikon.com/lineup/dslr/d600/img/sample01/img_01_l.jpg'},
		additionalData = {'herp': 'derp'},
		postDelay      = 0,
	)

	interface.put_job(raw_job_3)
	interface.put_job(raw_job_4)

	# Poll for responses for up to ~15 minutes (one attempt per second).
	for _ in range(60 * 15):
		try:
			tmp = interface.get_job()
			if tmp:
				print("response!")
				dump_response(tmp)
			else:
				print("No tmp:", tmp)
				time.sleep(1)
		except queue.Empty:
			time.sleep(1)
def put_job(self, put=3):
	'''
	Select up to `put` un-validated NU release items and dispatch a remote
	HEAD job for each. Returns the number of jobs actually dispatched
	(or None when there was nothing to fetch).

	Selection strategy: pull a large candidate pool (items first seen in
	the last 72 hours, plus a long-term backlog), then randomly sample,
	preferring items closest to full resolution (2 resolves, then 1,
	then 0) so in-flight items complete as fast as possible.

	Fix over the original: the error-log message typo "more then" is now
	"more than". The output counter is also a distinct local instead of
	reusing the `put` parameter.
	'''
	with db.session_context() as db_sess:
		self.log.info("Loading rows to fetch..")
		recent_d = datetime.datetime.now() - datetime.timedelta(hours=72)

		# Recently-seen, not-yet-validated items.
		recentq = db_sess.query(db.NuReleaseItem)                \
			.outerjoin(db.NuResolvedOutbound)                    \
			.filter(db.NuReleaseItem.validated == False)         \
			.filter(db.NuReleaseItem.first_seen >= recent_d)     \
			.options(joinedload('resolved'))                     \
			.order_by(desc(db.NuReleaseItem.first_seen))         \
			.group_by(db.NuReleaseItem.id)                       \
			.limit(max(100, put*10))

		# Long-term backlog of not-yet-validated items.
		bulkq = db_sess.query(db.NuReleaseItem)                  \
			.outerjoin(db.NuResolvedOutbound)                    \
			.filter(db.NuReleaseItem.validated == False)         \
			.options(joinedload('resolved'))                     \
			.order_by(desc(db.NuReleaseItem.first_seen))         \
			.group_by(db.NuReleaseItem.id)                       \
			.limit(max(100, put*6))

		bulkset   = bulkq.all()
		recentset = recentq.all()
		self.log.info("Have %s recent items, %s long-term items to fetch", len(recentset), len(bulkset))

		# An item can appear in both queries; de-duplicate by primary key.
		haveset = bulkset + recentset
		filtered = {tmp.id: tmp for tmp in haveset}
		haveset = list(filtered.values())
		self.log.info("Total items after filtering for uniqueness %s", len(haveset))

		if not haveset:
			self.log.info("No jobs to remote HEAD.")
			return

		# We pick a large number of items, and randomly choose one of them.
		# This lets us weight the fetch preferentially to the recent items, but still
		# have some variability.
		# We prefer to fetch items that'll resolve as fast as possible.
		preferred_2 = [tmp for tmp in haveset if len(tmp.resolved) == 2]
		preferred_1 = [tmp for tmp in haveset if len(tmp.resolved) == 1]
		fallback    = [tmp for tmp in haveset if len(tmp.resolved) == 0]

		haveset = random.sample(preferred_2, min(put, len(preferred_2)))
		if len(haveset) < put:
			haveset.extend(random.sample(preferred_1, min(put-len(haveset), len(preferred_1))))
		if len(haveset) < put:
			haveset.extend(random.sample(fallback, min(put-len(haveset), len(fallback))))

		put_count = 0
		active = set()
		for have in haveset:
			# Three resolves means the item should already be validated.
			if len(list(have.resolved)) >= 3:
				raise RuntimeError("Overresolved item that's not valid.")

			# Bare novelupdates referrers should never reach the queue;
			# purge such items (and their resolves) entirely.
			if (   have.referrer == "http://www.novelupdates.com"
				or have.referrer == "https://www.novelupdates.com"
				or have.referrer == "https://www.novelupdates.com/"
				or have.referrer == "http://www.novelupdates.com/"):
				self.log.error("Wat?")
				self.log.error("Bad Referrer URL got into the input queue!")
				self.log.error("Id: %s, ref: %s", have.id, have.referrer)
				for bad_resolve in have.resolved:
					db_sess.delete(bad_resolve)
				db_sess.delete(have)
				db_sess.commit()
				continue

			# Items that repeatedly fail to resolve are rejected outright.
			if have.fetch_attempts > MAX_TOTAL_FETCH_ATTEMPTS:
				self.log.error("Wat?")
				self.log.error("Item fetched too many times!")
				self.log.error("Id: %s", have.id)
				self.log.error("Attempted more than %s resolves. Disabling.", MAX_TOTAL_FETCH_ATTEMPTS)
				have.reviewed  = 'rejected'
				have.validated = True
				db_sess.commit()
				continue

			# Don't queue the same outbound URL twice in one batch.
			if have.outbound_wrapper in active:
				continue
			active.add(have.outbound_wrapper)

			have.fetch_attempts += 1
			db_sess.commit()

			self.log.info("Putting job for url '%s', with %s resolves so far", have.outbound_wrapper, len(have.resolved))
			self.log.info("Referring page '%s'", have.referrer)

			raw_job = buildjob(
				module         = 'WebRequest',
				call           = 'getHeadTitleChromium',
				dispatchKey    = "fetcher",
				jobid          = -1,
				args           = [],
				kwargs         = {
					"url"           : have.outbound_wrapper,
					"referrer"      : have.referrer,
					"title_timeout" : 30,
				},
				additionalData = {
					'mode'        : 'fetch',
					'wrapper_url' : have.outbound_wrapper,
					'referrer'    : have.referrer,
				},
				postDelay      = 0,
				unique_id      = have.outbound_wrapper,
				serialize      = 'Nu-Header',
			)
			self.rpc.put_job(raw_job)
			put_count += 1

		return put_count