def exposed_head(url, ref):
    """
    Do a phantomjs HEAD request for url `url`, passing the referrer `ref`.

    A single ``NUWebRequest.getHeadPhantomJS`` job is submitted through a
    ``RemoteJobInterface`` and the interface is then polled in a loop, with
    every response handed to ``print_response``. The loop has no exit
    condition, so this function does not return normally.
    """
    interface = common.get_rpyc.RemoteJobInterface("Test_Interface!")
    print('wat?')
    print(interface)

    # Describe the job once, then hand the description to buildjob().
    job_spec = {
        'module': 'NUWebRequest',
        'call': 'getHeadPhantomJS',
        'dispatchKey': "fetcher",
        'jobid': -1,
        'args': [url, ref],
        'kwargs': {},
        'additionalData': {'mode': 'fetch'},
        'postDelay': 0,
        'unique_id': url,
    }
    interface.put_job(buildjob(**job_spec))

    while True:
        try:
            response = interface.get_job()
            print_response(response)
            # Back off for a second whenever the poll came back falsy.
            if not response:
                time.sleep(1)
        except queue.Empty:
            print("No response yet?")
def main():
    """
    Submit one test fetch job over rpyc and poll for results.

    Connects to an rpyc server on localhost:12345, queues a single
    ``WebRequest.getItem`` job for 'http://www.google.com' under the queue
    name 'wat', and then polls ``getJob`` in an endless loop.

    The loop only ends when an exception propagates (an unexpected remote
    error, or e.g. a KeyboardInterrupt). The connection is therefore closed
    in a ``finally`` clause; a ``close()`` placed after the loop would never
    execute.

    Raises:
        rpyc.core.vinegar.GenericException: re-raised, after printing
            diagnostics, when the remote error is not recognised as the
            cached 'queue.Empty' exception class.
    """
    import inspect

    logSetup.initLogging()

    remote = rpyc.connect("localhost", 12345, config=rpyc.core.protocol.DEFAULT_CONFIG)
    try:
        raw_job = buildjob(
            module='WebRequest',
            call='getItem',
            dispatchKey="fetcher",
            jobid=-1,
            args=['http://www.google.com'],
            kwargs={},
            additionalData={'mode': 'fetch'},
            postDelay=0,
        )

        print(remote)
        print(remote.root.putJob('wat', raw_job))

        while True:
            try:
                remote.root.getJob("wat")
                print("Got job!")
            except queue.Empty:
                time.sleep(1)
                print("No message")
            except rpyc.core.vinegar.GenericException as e:
                # HACK: this reaches into rpyc's private exception cache.
                # Presumably a remote queue.Empty arrives as a generated
                # GenericException subclass stored under 'queue.Empty' --
                # confirm against the rpyc version in use.
                cache = rpyc.core.vinegar._generic_exceptions_cache
                if 'queue.Empty' in cache and isinstance(e, cache['queue.Empty']):
                    print("Empty exception")
                    continue

                # Unrecognised remote error: dump type diagnostics, then
                # propagate it.
                print("type", type(e))
                print("instance", issubclass(type(e), queue.Empty))
                print(inspect.getmro(type(e)))

                fakemodule = {"__module__": "%s/%s" % ("rpyc.core.vinegar", "queue")}
                extp = type("queue.Empty", (rpyc.core.vinegar.GenericException,), fakemodule)
                print(extp)
                print(isinstance(e, extp))
                print(isinstance(e, rpyc.core.vinegar.GenericException))
                print(cache)
                raise
    finally:
        # Always release the connection, however the loop is left.
        remote.close()
def main():
    """
    Push a single test job through the rpyc job service and poll for output.

    Opens an rpyc connection to localhost:12345, submits one
    ``WebRequest.getItem`` job for 'http://www.google.com' to the 'wat'
    queue, then calls ``getJob`` repeatedly.

    The polling loop never terminates on its own; it is only left when an
    exception escapes. Closing the connection is done in ``finally`` so that
    it actually happens (a ``close()`` after the loop is unreachable).

    Raises:
        rpyc.core.vinegar.GenericException: re-raised after diagnostics are
            printed, when the remote exception is not an instance of the
            class cached under 'queue.Empty'.
    """
    import inspect

    logSetup.initLogging()

    remote = rpyc.connect("localhost", 12345, config=rpyc.core.protocol.DEFAULT_CONFIG)
    try:
        raw_job = buildjob(
            module='WebRequest',
            call='getItem',
            dispatchKey="fetcher",
            jobid=-1,
            args=['http://www.google.com'],
            kwargs={},
            additionalData={'mode': 'fetch'},
            postDelay=0,
        )

        print(remote)
        print(remote.root.putJob('wat', raw_job))

        while True:
            try:
                remote.root.getJob("wat")
                print("Got job!")
            except queue.Empty:
                time.sleep(1)
                print("No message")
            except rpyc.core.vinegar.GenericException as e:
                # HACK: relies on rpyc's private exception cache. It looks
                # like the remote side's queue.Empty is surfaced as a
                # generated class keyed by 'queue.Empty' -- verify against
                # the installed rpyc.
                generic_cache = rpyc.core.vinegar._generic_exceptions_cache
                if 'queue.Empty' in generic_cache:
                    if isinstance(e, generic_cache['queue.Empty']):
                        print("Empty exception")
                        continue

                # Anything else: print what we can about the type and
                # re-raise the original exception.
                print("type", type(e))
                print("instance", issubclass(type(e), queue.Empty))
                print(inspect.getmro(type(e)))

                fakemodule = {"__module__": "%s/%s" % ("rpyc.core.vinegar", "queue")}
                extp = type("queue.Empty", (rpyc.core.vinegar.GenericException,), fakemodule)
                print(extp)
                print(isinstance(e, extp))
                print(isinstance(e, rpyc.core.vinegar.GenericException))
                print(generic_cache)
                raise
    finally:
        # Runs on every exit path, so the connection is not leaked.
        remote.close()
def main():
    """
    Submit one test fetch job through ``RemoteJobInterface`` and poll for it.

    Builds a ``WebRequest.getItem`` job for 'http://www.google.com', puts it
    on the interface named "wat", and then calls ``get_job`` in an endless
    loop, printing every truthy result.

    The loop is only left when an exception propagates. The original code
    ended with ``remote.close()``, but no ``remote`` exists in this function
    and the statement could never be reached, so it has been removed.

    Raises:
        Exception: any exception other than ``queue.Empty`` raised while
            polling is re-raised after its type information is printed.
    """
    import inspect

    import logSetup
    from WebMirror.NewJobQueue import buildjob

    logSetup.initLogging()

    raw_job = buildjob(
        module='WebRequest',
        call='getItem',
        dispatchKey="fetcher",
        jobid=-1,
        args=['http://www.google.com'],
        kwargs={},
        additionalData={'mode': 'fetch'},
        postDelay=0,
    )

    rint = RemoteJobInterface("wat")
    print(rint.put_job(raw_job))
    print(rint)

    while True:
        try:
            j = rint.get_job()
            if j:
                print("Got job!", j)
        except queue.Empty:
            time.sleep(1)
            print("No message")
        except Exception as e:
            # Previously narrowed to pyjsonrpc.JsonRpcError; now any error
            # is reported with its type details and then propagated.
            print("type", type(e))
            print("instance", issubclass(type(e), queue.Empty))
            print(inspect.getmro(type(e)))
            raise
def main():
    """
    Queue a single test job via ``RemoteJobInterface`` and poll for results.

    A ``WebRequest.getItem`` job for 'http://www.google.com' is built and put
    on the interface named "wat"; ``get_job`` is then called forever and each
    truthy result is printed.

    Nothing breaks out of the loop except a propagating exception. The stale
    trailing ``remote.close()`` was dropped: ``remote`` is not defined in
    this function, and the line was unreachable anyway.

    Raises:
        Exception: anything other than ``queue.Empty`` raised during polling
            is re-raised once its type details have been printed.
    """
    import inspect

    import logSetup
    from WebMirror.NewJobQueue import buildjob

    logSetup.initLogging()

    raw_job = buildjob(
        module='WebRequest',
        call='getItem',
        dispatchKey="fetcher",
        jobid=-1,
        args=['http://www.google.com'],
        kwargs={},
        additionalData={'mode': 'fetch'},
        postDelay=0,
    )

    rint = RemoteJobInterface("wat")
    print(rint.put_job(raw_job))
    print(rint)

    while True:
        try:
            j = rint.get_job()
            if j:
                print("Got job!", j)
        except queue.Empty:
            time.sleep(1)
            print("No message")
        except Exception as e:
            # Was once restricted to pyjsonrpc.JsonRpcError; every error is
            # now described on stdout and re-raised unchanged.
            print("type", type(e))
            print("instance", issubclass(type(e), queue.Empty))
            print(inspect.getmro(type(e)))
            raise
def put_job(self, put=3):
    """
    Queue remote HEAD/title fetch jobs for unvalidated release items.

    Loads up to ``max(100, put*3)`` ``NuReleaseItem`` rows that are not
    validated and have fewer than three ``NuResolvedOutbound`` rows, newest
    ``first_seen`` first, then randomly samples at most ``put`` of them and
    submits one ``NUWebRequest.getHeadTitlePhantomJS`` job per sampled row
    via ``self.rpc``.

    Rows whose referrer is the bare novelupdates.com root URL are logged as
    errors and skipped.

    Raises:
        RuntimeError: if a sampled row already has three or more entries in
            its ``resolved`` collection.
    """
    self.log.info("Loading a row to fetch...")

    # An earlier variant of the query below ordered by func.random() instead
    # of first_seen and merged its rows into the candidate set; it is
    # disabled.

    self.log.info("Loading a row to fetch...")

    candidate_query = (
        self.db_sess.query(db.NuReleaseItem)
        .outerjoin(db.NuResolvedOutbound)
        .filter(db.NuReleaseItem.validated == False)
        .having(func.count(db.NuResolvedOutbound.parent) < 3)
        .order_by(desc(db.NuReleaseItem.first_seen))
        .group_by(db.NuReleaseItem.id)
        .limit(max(100, put*3))
    )
    candidates = candidate_query.all()

    if not candidates:
        self.log.info("No jobs to remote HEAD.")
        return

    # Fetch a large window of recent rows and sample from it: this favours
    # recent items while still giving some variability between runs.
    selected = random.sample(candidates, min(put, len(candidates)))

    site_root_urls = (
        "http://www.novelupdates.com",
        "https://www.novelupdates.com",
        "https://www.novelupdates.com/",
        "http://www.novelupdates.com/",
    )

    for item in selected:
        if len(list(item.resolved)) >= 3:
            raise RuntimeError("Overresolved item that's not valid.")

        referrer = item.referrer
        if referrer in site_root_urls:
            self.log.error("Wat?")
            self.log.error("Bad Referrer URL got into the input queue!")
            self.log.error("Id: %s", item.id)
            continue

        wrapper_url = item.outbound_wrapper
        self.log.info("Putting job for url '%s'", wrapper_url)
        self.log.info("Referring page '%s'", referrer)

        extra = {
            'mode': 'fetch',
            'wrapper_url': wrapper_url,
            'referrer': referrer,
        }
        job = buildjob(
            module='NUWebRequest',
            call='getHeadTitlePhantomJS',
            dispatchKey="fetcher",
            jobid=-1,
            args=[wrapper_url, referrer],
            kwargs={},
            additionalData=extra,
            postDelay=0,
            unique_id=wrapper_url,
        )
        self.rpc.put_job(job)
def put_job(self, put=3):
    """
    Queue remote HEAD/title fetch jobs for unvalidated release items.

    Two candidate sets of ``NuReleaseItem`` rows are loaded, both limited to
    unvalidated rows with fewer than three ``NuResolvedOutbound`` rows and
    ordered newest ``first_seen`` first:

    * a "recent" set, restricted to rows first seen within the last 72
      hours, limited to ``max(100, put*3)`` rows;
    * a "bulk" set with no age restriction, limited to ``max(100, put)``
      rows.

    The two lists are concatenated (bulk first), at most ``put`` entries are
    randomly sampled from the result, and one
    ``WebRequest.getHeadTitleChromium`` job is submitted per sampled row via
    ``self.rpc``.

    Rows whose referrer is the bare novelupdates.com root URL are logged as
    errors and skipped.

    Raises:
        RuntimeError: if a sampled row already has three or more entries in
            its ``resolved`` collection.
    """
    self.log.info("Loading rows to fetch...")

    cutoff = datetime.datetime.now() - datetime.timedelta(hours=72)

    recent_query = (
        self.db_sess.query(db.NuReleaseItem)
        .outerjoin(db.NuResolvedOutbound)
        .filter(db.NuReleaseItem.validated == False)
        .filter(db.NuReleaseItem.first_seen >= cutoff)
        .having(func.count(db.NuResolvedOutbound.parent) < 3)
        .order_by(desc(db.NuReleaseItem.first_seen))
        .group_by(db.NuReleaseItem.id)
        .limit(max(100, put*3))
    )

    bulk_query = (
        self.db_sess.query(db.NuReleaseItem)
        .outerjoin(db.NuResolvedOutbound)
        .filter(db.NuReleaseItem.validated == False)
        .having(func.count(db.NuResolvedOutbound.parent) < 3)
        .order_by(desc(db.NuReleaseItem.first_seen))
        .group_by(db.NuReleaseItem.id)
        .limit(max(100, put))
    )

    bulk_rows = bulk_query.all()
    recent_rows = recent_query.all()

    self.log.info("Have %s recent items, %s long-term items to fetch", len(recent_rows), len(bulk_rows))

    candidates = bulk_rows + recent_rows

    if not candidates:
        self.log.info("No jobs to remote HEAD.")
        return

    # Load a large pool and sample from it: this weights fetching towards
    # recent items while still giving some variability between runs.
    selected = random.sample(candidates, min(put, len(candidates)))

    site_root_urls = (
        "http://www.novelupdates.com",
        "https://www.novelupdates.com",
        "https://www.novelupdates.com/",
        "http://www.novelupdates.com/",
    )

    for item in selected:
        if len(list(item.resolved)) >= 3:
            raise RuntimeError("Overresolved item that's not valid.")

        referrer = item.referrer
        if referrer in site_root_urls:
            self.log.error("Wat?")
            self.log.error("Bad Referrer URL got into the input queue!")
            self.log.error("Id: %s", item.id)
            continue

        wrapper_url = item.outbound_wrapper
        self.log.info("Putting job for url '%s'", wrapper_url)
        self.log.info("Referring page '%s'", referrer)

        # A disabled earlier variant chose at random (50/50) between
        # NUWebRequest.getHeadTitlePhantomJS and this Chromium call; only
        # the Chromium path is active.
        extra = {
            'mode': 'fetch',
            'wrapper_url': wrapper_url,
            'referrer': referrer,
        }
        job = buildjob(
            module='WebRequest',
            call='getHeadTitleChromium',
            dispatchKey="fetcher",
            jobid=-1,
            args=[wrapper_url, referrer],
            kwargs={},
            additionalData=extra,
            postDelay=0,
            unique_id=wrapper_url,
            serialize=True,
        )
        self.rpc.put_job(job)