def exposed_head(url, ref):
	'''
	Queue a `getHeadPhantomJS` job for `url`, with the referrer `ref` as the
	second call argument, then poll the job interface and print whatever
	comes back.

	The polling loop has no exit condition: this function only ends when an
	exception other than `queue.Empty` propagates out of it.
	'''

	iface = common.get_rpyc.RemoteJobInterface("Test_Interface!")
	print('wat?')
	print(iface)

	head_job = buildjob(
		module='NUWebRequest',
		call='getHeadPhantomJS',
		dispatchKey="fetcher",
		jobid=-1,
		args=[url, ref],
		kwargs={},
		additionalData={'mode' : 'fetch'},
		postDelay=0,
		unique_id=url,
	)
	iface.put_job(head_job)

	while True:
		try:
			reply = iface.get_job()
			print_response(reply)
			if reply:
				continue
			# Falsy reply: pause for a second before polling again.
			time.sleep(1)
		except queue.Empty:
			print("No response yet?")
def exposed_head(url, ref):
	'''
	Submit a `getHeadPhantomJS` job (arguments: `url`, then the referrer
	`ref`) through a `RemoteJobInterface`, and keep printing the responses
	fetched from it.

	There is no exit condition on the polling loop, so the call returns only
	by way of an exception that is not `queue.Empty`.
	'''

	job_iface = common.get_rpyc.RemoteJobInterface("Test_Interface!")
	print('wat?')
	print(job_iface)

	request = buildjob(
		module='NUWebRequest',
		call='getHeadPhantomJS',
		dispatchKey="fetcher",
		jobid=-1,
		args=[url, ref],
		kwargs={},
		additionalData={'mode' : 'fetch'},
		postDelay=0,
		unique_id=url,
	)
	job_iface.put_job(request)

	while True:
		try:
			answer = job_iface.get_job()
			print_response(answer)
			if answer:
				continue
			# Nothing truthy was returned; wait a second before retrying.
			time.sleep(1)
		except queue.Empty:
			print("No response yet?")
Example #3
0
def main():
    '''
    Submit one `WebRequest.getItem` job for http://www.google.com to the rpyc
    service at localhost:12345 under the key 'wat', then poll `getJob("wat")`
    in a loop, printing diagnostics.

    The poll loop has no exit condition, so the function only ends when an
    exception propagates out of it. The rpyc connection is closed on the way
    out.
    '''
    logSetup.initLogging()

    remote = rpyc.connect("localhost",
                          12345,
                          config=rpyc.core.protocol.DEFAULT_CONFIG)

    # The loop below never breaks, so a close() placed after it would never
    # run. try/finally releases the connection however the function exits.
    try:
        raw_job = buildjob(module='WebRequest',
                           call='getItem',
                           dispatchKey="fetcher",
                           jobid=-1,
                           args=['http://www.google.com'],
                           kwargs={},
                           additionalData={'mode': 'fetch'},
                           postDelay=0)

        print(remote)
        print(remote.root.putJob('wat', raw_job))

        while 1:
            try:
                j = remote.root.getJob("wat")
                print("Got job!")
            except queue.Empty:
                time.sleep(1)
                print("No message")
            except rpyc.core.vinegar.GenericException as e:
                # this is horrible
                # Check whether the exception is an instance of the class
                # that vinegar's private cache holds under 'queue.Empty'.
                cache = rpyc.core.vinegar._generic_exceptions_cache
                if 'queue.Empty' in cache:
                    if isinstance(e, cache['queue.Empty']):
                        print("Empty exception")
                        continue

                # Anything else: dump type information, then re-raise.
                print("type", type(e))
                print("instance", issubclass(type(e), queue.Empty))

                import inspect
                print(inspect.getmro(type(e)))

                # Debug aid: build a GenericException subclass named
                # "queue.Empty" and print whether `e` is an instance of it.
                fakemodule = {
                    "__module__": "%s/%s" % ("rpyc.core.vinegar", "queue")
                }
                extp = type("queue.Empty",
                            (rpyc.core.vinegar.GenericException, ),
                            fakemodule)
                print(extp)
                print(isinstance(e, extp))
                print(isinstance(e, rpyc.core.vinegar.GenericException))

                print(cache)
                raise
    finally:
        remote.close()
Example #4
0
def main():
	'''
	Submit one `WebRequest.getItem` job for http://www.google.com to the rpyc
	service at localhost:12345 under the key 'wat', then poll `getJob("wat")`
	in a loop, printing diagnostics.

	The poll loop has no exit condition, so the function only ends when an
	exception propagates out of it. The rpyc connection is closed on the way
	out.
	'''
	logSetup.initLogging()

	remote = rpyc.connect("localhost", 12345, config = rpyc.core.protocol.DEFAULT_CONFIG)

	# The loop below never breaks, so a close() placed after it would never
	# run. try/finally releases the connection however the function exits.
	try:
		raw_job = buildjob(
			module         = 'WebRequest',
			call           = 'getItem',
			dispatchKey    = "fetcher",
			jobid          = -1,
			args           = ['http://www.google.com'],
			kwargs         = {},
			additionalData = {'mode' : 'fetch'},
			postDelay      = 0
		)

		print(remote)
		print(remote.root.putJob('wat', raw_job))

		while 1:
			try:
				j = remote.root.getJob("wat")
				print("Got job!")
			except queue.Empty:
				time.sleep(1)
				print("No message")
			except rpyc.core.vinegar.GenericException as e:
				# this is horrible
				# Check whether the exception is an instance of the class
				# that vinegar's private cache holds under 'queue.Empty'.
				cache = rpyc.core.vinegar._generic_exceptions_cache
				if 'queue.Empty' in cache:
					if isinstance(e, cache['queue.Empty']):
						print("Empty exception")
						continue

				# Anything else: dump type information, then re-raise.
				print("type", type(e))
				print("instance", issubclass(type(e), queue.Empty))

				import inspect
				print(inspect.getmro(type(e)))

				# Debug aid: build a GenericException subclass named
				# "queue.Empty" and print whether `e` is an instance of it.
				fakemodule = {"__module__" : "%s/%s" % ("rpyc.core.vinegar", "queue")}
				extp = type("queue.Empty", (rpyc.core.vinegar.GenericException,), fakemodule)
				print(extp)
				print(isinstance(e, extp))
				print(isinstance(e, rpyc.core.vinegar.GenericException))

				print(cache)
				raise
	finally:
		remote.close()
Example #5
0
def main():
	'''
	Submit one `WebRequest.getItem` job for http://www.google.com through a
	`RemoteJobInterface("wat")` and poll `get_job()` in a loop, printing any
	truthy result.

	The poll loop has no exit condition; the function only ends when an
	exception other than `queue.Empty` is raised, which is printed in detail
	and then re-raised.
	'''
	import logSetup
	from WebMirror.NewJobQueue import buildjob
	logSetup.initLogging()

	raw_job = buildjob(
		module         = 'WebRequest',
		call           = 'getItem',
		dispatchKey    = "fetcher",
		jobid          = -1,
		args           = ['http://www.google.com'],
		kwargs         = {},
		additionalData = {'mode' : 'fetch'},
		postDelay      = 0
	)

	rint = RemoteJobInterface("wat")
	print(rint.put_job(raw_job))
	print(rint)
	while 1:
		try:
			j = rint.get_job()
			if j:
				print("Got job!", j)
		except queue.Empty:
			time.sleep(1)
			print("No message")
		except Exception as e:
			# Dump what kind of exception this is before letting it propagate.
			print("type", type(e))
			print("instance", issubclass(type(e), queue.Empty))

			import inspect
			print(inspect.getmro(type(e)))

			raise

	# NOTE: the original ended with `remote.close()` here. It was unreachable
	# (the loop never breaks) and `remote` is not defined in this function,
	# so it has been removed.
Example #6
0
def main():
    '''
    Submit one `WebRequest.getItem` job for http://www.google.com through a
    `RemoteJobInterface("wat")` and poll `get_job()` in a loop, printing any
    truthy result.

    The poll loop has no exit condition; the function only ends when an
    exception other than `queue.Empty` is raised, which is printed in detail
    and then re-raised.
    '''
    import logSetup
    from WebMirror.NewJobQueue import buildjob
    logSetup.initLogging()

    raw_job = buildjob(module='WebRequest',
                       call='getItem',
                       dispatchKey="fetcher",
                       jobid=-1,
                       args=['http://www.google.com'],
                       kwargs={},
                       additionalData={'mode': 'fetch'},
                       postDelay=0)

    rint = RemoteJobInterface("wat")
    print(rint.put_job(raw_job))
    print(rint)
    while 1:
        try:
            j = rint.get_job()
            if j:
                print("Got job!", j)
        except queue.Empty:
            time.sleep(1)
            print("No message")
        except Exception as e:
            # Dump what kind of exception this is before letting it propagate.
            print("type", type(e))
            print("instance", issubclass(type(e), queue.Empty))

            import inspect
            print(inspect.getmro(type(e)))

            raise

    # NOTE: the original ended with `remote.close()` here. It was unreachable
    # (the loop never breaks) and `remote` is not defined in this function,
    # so it has been removed.
Example #7
0
	def put_job(self, put=3):
		'''
		Query unvalidated `NuReleaseItem` rows that have fewer than three
		joined `NuResolvedOutbound` rows, newest `first_seen` first, and queue
		a `getHeadTitlePhantomJS` job for a random sample of at most `put` of
		them via `self.rpc.put_job`.

		Items whose referrer is the bare novelupdates.com root URL are logged
		as errors and skipped.

		Returns None. Raises RuntimeError if a sampled item already has three
		or more entries in `resolved`.
		'''
		self.log.info("Loading a row to fetch...")

		# (A disabled earlier variant of the query below ordered by
		# func.random() and appended its rows to the result set.)

		self.log.info("Loading a row to fetch...")
		# `validated == False` is kept verbatim: presumably a SQLAlchemy column
		# expression, so it must not become `not ...` or `is False`.
		pending_query = (
			self.db_sess.query(db.NuReleaseItem)
			.outerjoin(db.NuResolvedOutbound)
			.filter(db.NuReleaseItem.validated == False)
			.having(func.count(db.NuResolvedOutbound.parent) < 3)
			.order_by(desc(db.NuReleaseItem.first_seen))
			.group_by(db.NuReleaseItem.id)
			.limit(max(100, put*3))
		)
		candidates = pending_query.all()

		if not candidates:
			self.log.info("No jobs to remote HEAD.")
			return

		# The query returns a pool larger than `put`, newest first; sampling
		# from that pool favours recent items while keeping some variability.
		sample_size = min(put, len(candidates))
		chosen = random.sample(candidates, sample_size)

		rejected_referrers = (
			"http://www.novelupdates.com",
			"https://www.novelupdates.com",
			"https://www.novelupdates.com/",
			"http://www.novelupdates.com/",
		)

		for item in chosen:
			if len(list(item.resolved)) >= 3:
				raise RuntimeError("Overresolved item that's not valid.")

			if item.referrer in rejected_referrers:
				self.log.error("Wat?")
				self.log.error("Bad Referrer URL got into the input queue!")
				self.log.error("Id: %s", item.id)
				continue

			self.log.info("Putting job for url '%s'", item.outbound_wrapper)
			self.log.info("Referring page '%s'", item.referrer)

			extra = {
				'mode'        : 'fetch',
				'wrapper_url' : item.outbound_wrapper,
				'referrer'    : item.referrer,
			}
			head_job = buildjob(
				module='NUWebRequest',
				call='getHeadTitlePhantomJS',
				dispatchKey="fetcher",
				jobid=-1,
				args=[item.outbound_wrapper, item.referrer],
				kwargs={},
				additionalData=extra,
				postDelay=0,
				unique_id=item.outbound_wrapper,
			)

			self.rpc.put_job(head_job)
Example #8
0
    def put_job(self, put=3):
        '''
        Queue `getHeadTitleChromium` jobs for a random sample of at most
        `put` unvalidated `NuReleaseItem` rows.

        Two queries feed the sampling pool, both restricted to unvalidated
        items with fewer than three joined `NuResolvedOutbound` rows and
        ordered newest `first_seen` first:

        - "recent": only items first seen in the last 72 hours, limited to
          max(100, put*3) rows;
        - "bulk": no age restriction, limited to max(100, put) rows.

        Items whose referrer is the bare novelupdates.com root URL are logged
        as errors and skipped.

        Returns None. Raises RuntimeError if a sampled item already has three
        or more entries in `resolved`.
        '''
        self.log.info("Loading rows to fetch...")
        cutoff = datetime.datetime.now() - datetime.timedelta(hours=72)

        # `validated == False` is kept verbatim in both queries: presumably a
        # SQLAlchemy column expression, so it must not become `not ...`.
        recent_query = (
            self.db_sess.query(db.NuReleaseItem)
            .outerjoin(db.NuResolvedOutbound)
            .filter(db.NuReleaseItem.validated == False)
            .filter(db.NuReleaseItem.first_seen >= cutoff)
            .having(func.count(db.NuResolvedOutbound.parent) < 3)
            .order_by(desc(db.NuReleaseItem.first_seen))
            .group_by(db.NuReleaseItem.id)
            .limit(max(100, put*3))
        )

        bulk_query = (
            self.db_sess.query(db.NuReleaseItem)
            .outerjoin(db.NuResolvedOutbound)
            .filter(db.NuReleaseItem.validated == False)
            .having(func.count(db.NuResolvedOutbound.parent) < 3)
            .order_by(desc(db.NuReleaseItem.first_seen))
            .group_by(db.NuReleaseItem.id)
            .limit(max(100, put))
        )

        bulk_rows = bulk_query.all()
        recent_rows = recent_query.all()

        self.log.info("Have %s recent items, %s long-term items to fetch",
                      len(recent_rows), len(bulk_rows))

        # NOTE(review): a row matched by both queries appears twice in this
        # pool, so it can be sampled twice -- confirm whether that extra
        # weighting towards recent items is intended.
        pool = bulk_rows + recent_rows

        if not pool:
            self.log.info("No jobs to remote HEAD.")
            return

        # The pool is larger than `put`; sampling from it favours recent
        # items while keeping some variability.
        sample_size = min(put, len(pool))
        chosen = random.sample(pool, sample_size)

        rejected_referrers = (
            "http://www.novelupdates.com",
            "https://www.novelupdates.com",
            "https://www.novelupdates.com/",
            "http://www.novelupdates.com/",
        )

        for item in chosen:
            if len(list(item.resolved)) >= 3:
                raise RuntimeError("Overresolved item that's not valid.")

            if item.referrer in rejected_referrers:
                self.log.error("Wat?")
                self.log.error("Bad Referrer URL got into the input queue!")
                self.log.error("Id: %s", item.id)
                continue

            self.log.info("Putting job for url '%s'", item.outbound_wrapper)
            self.log.info("Referring page '%s'", item.referrer)

            # (A disabled variant chose at random between
            # NUWebRequest.getHeadTitlePhantomJS and
            # WebRequest.getHeadTitleChromium; only the Chromium call is live.)
            extra = {
                'mode': 'fetch',
                'wrapper_url': item.outbound_wrapper,
                'referrer': item.referrer,
            }
            head_job = buildjob(module='WebRequest',
                                call='getHeadTitleChromium',
                                dispatchKey="fetcher",
                                jobid=-1,
                                args=[item.outbound_wrapper, item.referrer],
                                kwargs={},
                                additionalData=extra,
                                postDelay=0,
                                unique_id=item.outbound_wrapper,
                                serialize=True)

            self.rpc.put_job(head_job)