def __load_referrer(self, base_url, item):
        """
        Resolve the outbound-wrapper URL for ``item`` with the local fetcher
        and record the result as a ``NuResolvedOutbound`` row on the item.

        base_url -- page the link was found on; sent as the HTTP Referer.
        item     -- NuReleaseItem-like row; mutated (validated, actual_target,
                    reviewed, resolved) and committed via ``self.db_sess``.

        Raises RuntimeError if the fetch still lands on the novelupdates
        ``extnu`` wrapper (i.e. extraction failed).
        """

        # Already resolved by the local client; nothing to do.
        if any(tmp.client_id == 'local' for tmp in item.resolved):
            return

        # Don't dewaf: disable the auto-WAF handling for this one fetch.
        # The try/finally guarantees the previous setting is restored even
        # if the fetch raises (the original code left auto_waf disabled on
        # any exception here).
        old_autowaf = self.kwargs['wg_proxy']().rules['auto_waf']
        self.kwargs['wg_proxy']().rules['auto_waf'] = False
        try:
            content, name, mime, url = self.kwargs['wg_proxy']().getFileNameMimeUrl(
                item.outbound_wrapper,
                addlHeaders={'Referer': base_url})
        finally:
            self.kwargs['wg_proxy']().rules['auto_waf'] = old_autowaf

        if url.startswith("https://www.novelupdates.com/extnu/"):
            raise RuntimeError("Failure when extracting NU referrer!")

        item.validated     = True
        item.validated_on  = datetime.datetime.now()
        item.actual_target = url
        item.reviewed      = "manual_validate"

        pg_title = title_from_html(content)

        new = db.NuResolvedOutbound(
            client_id="local",
            client_key="local",
            actual_target=url,
            resolved_title=pg_title,
            fetched_on=datetime.datetime.now(),
        )

        item.resolved.append(new)
        self.db_sess.commit()

        self.log.info("TL Group: %s. Series %s, chap: %s", item.groupinfo,
                      item.seriesname, item.releaseinfo)
        self.log.info("URL '%s' resolved to '%s'", item.outbound_wrapper,
                      item.actual_target)
        self.log.info("Page title: '%s'", pg_title)

        # Randomized delay between fetches so the traffic isn't mechanical.
        sleep = random.triangular(3, 10, 30)
        self.log.info("Sleeping %s", sleep)
        time.sleep(sleep)
def cross_sync(increment):
	"""
	Backfill NuReleaseItem / NuResolvedOutbound rows from the legacy
	NuOutboundWrapperMap table.

	increment -- number of new inserts to batch up between commits.

	Commits incrementally, commits once more at the end, and rolls the
	session back (then re-raises) if anything fails.
	"""
	sess = db.get_db_session()
	print("Loading extant rows...")
	old_nu_items = sess.query(db.NuOutboundWrapperMap).order_by(desc(db.NuOutboundWrapperMap.id)).all()
	print("Loaded. Processing")
	have_count = 0
	new_count  = 0
	loops      = 0
	nc_loops   = 0
	try:
		for old_nu in old_nu_items:
			# Fetch the release item for this wrapper URL, creating it if absent.
			have = sess.query(db.NuReleaseItem)                                     \
				.options(joinedload('resolved'))                                    \
				.filter(db.NuReleaseItem.outbound_wrapper==old_nu.outbound_wrapper) \
				.scalar()
			if not have:
				have = db.NuReleaseItem(
						validated        = old_nu.validated,
						seriesname       = old_nu.seriesname,
						releaseinfo      = old_nu.releaseinfo,
						groupinfo        = old_nu.groupinfo,
						referrer         = old_nu.referrer,
						outbound_wrapper = old_nu.outbound_wrapper,
						first_seen       = old_nu.released_on,
						actual_target    = old_nu.actual_target,
					)
				sess.add(have)
				loops += 1
				new_count += 1

			# Attach a resolution only when this (client, key, target) triple
			# is not already present on the release item.
			old_key  = (old_nu.client_id, old_nu.client_key, old_nu.actual_target)
			resolved = {(itm.client_id, itm.client_key, itm.actual_target) for itm in have.resolved}
			if old_key not in resolved:
				new = db.NuResolvedOutbound(
						client_id      = old_nu.client_id,
						client_key     = old_nu.client_key,
						actual_target  = old_nu.actual_target,
						fetched_on     = old_nu.released_on,
					)
				have.resolved.append(new)
				loops += 1
				new_count += 1
			else:
				have_count += 1
				nc_loops += 1

			# Commit in batches so a late failure doesn't discard all progress.
			if loops > increment:
				print("Commit! Have {}, new {} ({}, {})".format(have_count, new_count, loops, nc_loops))
				sess.commit()
				loops = 0

			# Periodic progress output during long runs of already-present rows.
			if nc_loops > 100:
				print("Have {}, new {} ({}, {})".format(have_count, new_count, loops, nc_loops))
				nc_loops = 0
		sess.commit()
	except Exception:
		sess.rollback()
		raise
Exemple #3
0
	def process_single_source_target_mapping(
				self,
				from_outbound_wrapper,
				from_outbound_referrer,
				to_url,
				to_url_title,
				client_id,
				client_key
			):
		"""
		Record a remotely-resolved (wrapper URL -> real URL) mapping in the DB.

		Returns True when the response is rejected (resolution failed or the
		URL is unusable), False after a successful insert or an IntegrityError
		(duplicate), and None when the base release row has vanished or the
		title did not resolve properly.
		"""

		# Strip mobile-view and analytics querystring noise from the target URL.
		if to_url.endswith("?m=1"):
			to_url = to_url[:-len("?m=1")]
		if "?utm_source=" in to_url:
			to_url = to_url.split("?utm_source=")[0]

		self.log.info("Processing remote head response: %s", (from_outbound_wrapper, from_outbound_referrer))
		self.log.info("Resolved job to URL: %s", to_url)
		self.log.info("Page title: %s", to_url_title)

		# Handle the 301/2 not resolving properly.
		netloc = urllib.parse.urlsplit(to_url).netloc
		if "novelupdates" in netloc:
			self.mon_con.incr('head-failed', 1)
			self.log.warning("Failed to validate external URL. Either scraper is blocked, or phantomjs is failing.")
			return True

		# Known-broken hostname fixups (mobile mirrors, moved domains, and one
		# mangled-replacement artifact).  Applied sequentially; order matters.
		url_fixups = [
			('/m.wuxiaworld.com',                  '/www.wuxiaworld.com'),
			# So dumb
			('/foruwww.wuxiaworld.com',            '/forum.wuxiaworld.com'),
			('tseirptranslations.blogspot.com',    'tseirptranslations.com'),
			('babelnovel.com/rssbooks/',           'babelnovel.com/books/'),
			('m.xianxiaworld.net',                 'www.xianxiaworld.net'),
			('shikkakutranslations.wordpress.com', 'shikkakutranslations.com'),
		]
		for bad, good in url_fixups:
			if bad in to_url:
				to_url = to_url.replace(bad, good)

		if any(tmp in to_url for tmp in BAD_RESOLVES):
			self.log.warning("Bad resolve in url: '%s'. Not inserting into DB.", to_url)
			self.mon_con.incr('head-failed', 1)
			return True

		if not to_url.lower().startswith("http"):
			self.log.warning("URL '%s' does not start with 'http'. Not inserting into DB.", to_url)
			self.mon_con.incr('head-failed', 1)
			return True

		to_url_title = self.check_resolve_locally(to_url, to_url_title)

		if '/?utm_source=feedburner' in to_url:
			to_url = to_url.split('/?utm_source=feedburner')[0] + "/"

		# Upsert loop: transient SQLAlchemy errors roll back and retry;
		# IntegrityError (duplicate row) rolls back and returns as handled.
		while True:
			with db.session_context() as db_sess:
				try:
					self.log.info("Trying for upsert")
					have = db_sess.query(db.NuReleaseItem)                                   \
						.options(joinedload('resolved'))                                     \
						.filter(db.NuReleaseItem.outbound_wrapper == from_outbound_wrapper)  \
						.filter(db.NuReleaseItem.referrer         == from_outbound_referrer) \
						.scalar()

					if not have:
						self.log.error("Base row deleted from resolve?")
						self.mon_con.incr('head-failed', 1)
						return

					if to_url_title is None:
						self.log.warning("Item didn't resolve to a name properly!")
						self.mon_con.incr('head-failed', 1)
						return

					# A "title" identical to the URL means the page never rendered.
					if to_url_title.strip().lower() == to_url.strip().lower():
						self.log.warning("Item didn't resolve to a name properly!")
						self.mon_con.incr('head-failed', 1)
						return

					new = db.NuResolvedOutbound(
							client_id      = client_id,
							client_key     = client_key,
							actual_target  = to_url,
							resolved_title = to_url_title,
							fetched_on     = datetime.datetime.now(),
						)

					have.resolved.append(new)
					db_sess.commit()
					self.mon_con.incr('head-received', 1)
					return False

				except sqlalchemy.exc.InvalidRequestError as e:
					self.log.error("Exception: %s!", e)

					db_sess.rollback()
				except sqlalchemy.exc.OperationalError as e:
					self.log.error("Exception: %s!", e)

					db_sess.rollback()
				except sqlalchemy.exc.IntegrityError as e:
					self.log.error("Exception: %s!", e)
					db_sess.rollback()
					self.mon_con.incr('head-failed', 1)
					return False

				except Exception:
					# NOTE: `new` may be unbound here if the failure happened
					# before the ORM row was constructed.
					self.log.error("Error when processing job response!")
					for line in traceback.format_exc().split("\n"):
						self.log.error(line)

					self.log.error("Contents of head response:")

					for line in pprint.pformat(new).split("\n"):
						self.log.error(line)
					self.mon_con.incr('head-failed', 1)
					return True
		# (Dead code after the while-loop removed: every path above either
		# returns or retries, so it could never execute.)
Exemple #4
0
	def process_single_avail(self):
		'''
		Pull one head-request response off the remote RPC queue and, if it
		validates, record the resolved URL against the matching NuReleaseItem
		row.  Returns False when there was nothing to process; other paths
		return True (handled/rejected) or None (base row missing / bad title).

		Example response:

		{
			'call': 'getHeadPhantomJS',
			'cancontinue': True,
			'dispatch_key': 'fetcher',
			'extradat': {'mode': 'fetch'},
			'jobid': -1,
			'jobmeta': {'sort_key': 'a269f164a16e11e6891500163ef6fe07'},
			'module': 'NUWebRequest',
			'ret': 'http://lightnovels.world/the-nine-godheads/nine-godheads-chapter-74/',
			'success': True,
			'user': '******',
			'user_uuid': 'urn:uuid:0a243518-834f-46d8-b34c-7f2afd20d37f'
		 }

		'''
		self.check_open_rpc_interface()

		# TypeError/KeyError from get_job() are treated as a broken RPC
		# connection: reopen the interface and fetch again.
		while 1:
			try:
				new = self.rpc.get_job()
				break
			except TypeError:
				self.check_open_rpc_interface()
			except KeyError:
				self.check_open_rpc_interface()


		if new is None:
			self.log.info("No NU Head responses!")
			return False

		# Envelope keys every well-formed job response must carry.
		expected_keys = ['call', 'cancontinue', 'dispatch_key', 'extradat', 'jobid',
					'jobmeta', 'module', 'ret', 'success', 'user', 'user_uuid']
		while True:
			with db.session_context() as db_sess:
				try:
					assert all([key in new for key in expected_keys])

					assert 'referrer'    in new['extradat']
					assert 'wrapper_url' in new['extradat']

					# The shape of 'ret' depends on which remote call ran:
					# a bare URL string, or (url, title) / {'url', 'title'}.
					if new['call'] == 'getHeadPhantomJS':
						respurl, title = new['ret'], ""
					elif new['call'] == 'getHeadTitlePhantomJS' or new['call'] == 'getHeadTitleChromium':
						if isinstance(new['ret'], (tuple, list)):
							respurl, title = new['ret']
						elif isinstance(new['ret'], dict):
							respurl = new['ret']['url']
							title   = new['ret']['title']
						else:
							raise RuntimeError("Don't know what the return type of `getHeadTitlePhantomJS` is! Type: %s" % type(new['ret']))

					else:
						raise RuntimeError("Response to unknown call: %s!" % new)

					# Strip the blogger mobile-view suffix.
					if respurl.endswith("?m=1"):
						respurl = respurl[:-len("?m=1")]

					self.log.info("Processing remote head response: %s", new)
					self.log.info("Resolved job to URL: %s", respurl)
					self.log.info("Page title: %s", title)

					# Handle the 301/2 not resolving properly.
					netloc = urllib.parse.urlsplit(respurl).netloc
					if "novelupdates" in netloc:
						self.log.warning("Failed to validate external URL. Either scraper is blocked, or phantomjs is failing.")
						return True

					# Known-broken hostname fixups; order is significant.
					if 'm.wuxiaworld.com' in respurl:
						respurl = respurl.replace('m.wuxiaworld.com', 'www.wuxiaworld.com')
					if 'tseirptranslations.blogspot.com' in respurl:
						respurl = respurl.replace('tseirptranslations.blogspot.com', 'tseirptranslations.com')
					if 'm.xianxiaworld.net' in respurl:
						respurl = respurl.replace('m.xianxiaworld.net', 'www.xianxiaworld.net')
					if 'shikkakutranslations.wordpress.com' in respurl:
						respurl = respurl.replace('shikkakutranslations.wordpress.com', 'shikkakutranslations.com')

					if any([tmp in respurl for tmp in BAD_RESOLVES]):
						self.log.warning("Bad resolve in url: '%s'. Not inserting into DB.", respurl)
						return True

					if not respurl.lower().startswith("http"):
						self.log.warning("URL '%s' does not start with 'http'. Not inserting into DB.", respurl)
						return True

					if '/?utm_source=feedburner' in respurl:
						respurl = respurl.split('/?utm_source=feedburner')[0] + "/"

					have = db_sess.query(db.NuReleaseItem)                                    \
						.options(joinedload('resolved'))                                           \
						.filter(db.NuReleaseItem.outbound_wrapper==new['extradat']['wrapper_url']) \
						.filter(db.NuReleaseItem.referrer==new['extradat']['referrer'])            \
						.scalar()

					if not have:
						self.log.error("Base row deleted from resolve?")
						return

					# A "title" identical to the URL means the page never rendered.
					if title.strip().lower() == respurl.strip().lower():
						self.log.warning("Item didn't resolve to a name properly!")
						return

					# NOTE: `new` is rebound here from the RPC response dict to
					# the ORM row; the generic handler below pprints whichever
					# object `new` is at failure time.
					new = db.NuResolvedOutbound(
							client_id      = new['user'],
							client_key     = new['user_uuid'],
							actual_target  = respurl,
							resolved_title = title,
							fetched_on     = datetime.datetime.now(),
						)

					have.resolved.append(new)
					db_sess.commit()

					self.mon_con.incr('head-received', 1)
					return True
				except sqlalchemy.exc.InvalidRequestError:
					db_sess.rollback()
				except sqlalchemy.exc.OperationalError:
					db_sess.rollback()
				except sqlalchemy.exc.IntegrityError:
					db_sess.rollback()


				except Exception:
					self.mon_con.incr('head-failed', 1)
					self.log.error("Error when processing job response!")
					for line in traceback.format_exc().split("\n"):
						self.log.error(line)

					self.log.error("Contents of head response:")

					for line in pprint.pformat(new).split("\n"):
						self.log.error(line)
					return True
			# NOTE(review): this return is INSIDE the `while True` loop, so the
			# rollback paths above fall through to it and return False — the
			# loop never actually retries.  Confirm whether a retry was intended
			# (a sibling variant of this method does loop after rollback).
			return False
Exemple #5
0
    def process_single_avail(self):
        '''
		Pull one head-request response off the remote RPC queue and, if it
		validates, record the resolved URL against the matching NuReleaseItem
		row.  Returns True when a response was handled or rejected; None when
		the base row is missing.

		Example response:

		{
			'call': 'getHeadPhantomJS',
			'cancontinue': True,
			'dispatch_key': 'fetcher',
			'extradat': {'mode': 'fetch'},
			'jobid': -1,
			'jobmeta': {'sort_key': 'a269f164a16e11e6891500163ef6fe07'},
			'module': 'NUWebRequest',
			'ret': 'http://lightnovels.world/the-nine-godheads/nine-godheads-chapter-74/',
			'success': True,
			'user': '******',
			'user_uuid': 'urn:uuid:0a243518-834f-46d8-b34c-7f2afd20d37f'
		 }

		'''
        self.check_open_rpc_interface()

        # Fetch one job.  TypeError/KeyError mean a broken RPC connection and
        # retry silently; BsonRpcError is tolerated up to 3 times, then
        # re-raised.
        errors = 0
        while 1:
            try:
                new = self.rpc.get_job()
                break
            except TypeError:
                self.check_open_rpc_interface()
            except KeyError:
                self.check_open_rpc_interface()
            except bsonrpc.exceptions.BsonRpcError as e:
                errors += 1
                self.check_open_rpc_interface()
                if errors > 3:
                    raise e
                else:
                    self.log.warning("Exception in RPC request:")
                    for line in traceback.format_exc().split("\n"):
                        self.log.warning(line)

        # Envelope keys every well-formed job response must carry.
        expected_keys = [
            'call', 'cancontinue', 'dispatch_key', 'extradat', 'jobid',
            'jobmeta', 'module', 'ret', 'success', 'user', 'user_uuid'
        ]
        if new is None:
            self.log.info("No NU Head responses!")
            return False
        # NOTE(review): the sqlalchemy except-clauses below roll back and then
        # loop again with the same inputs — on a persistent DB error this
        # retries indefinitely.  Confirm whether a retry cap is wanted.
        while True:
            try:
                assert all([key in new for key in expected_keys])

                assert 'referrer' in new['extradat']
                assert 'wrapper_url' in new['extradat']

                # The shape of 'ret' depends on which remote call ran:
                # a bare URL string, or (url, title) / {'url', 'title'}.
                if new['call'] == 'getHeadPhantomJS':
                    respurl, title = new['ret'], ""
                elif new['call'] == 'getHeadTitlePhantomJS' or new[
                        'call'] == 'getHeadTitleChromium':
                    if isinstance(new['ret'], (tuple, list)):
                        respurl, title = new['ret']
                    elif isinstance(new['ret'], dict):
                        respurl = new['ret']['url']
                        title = new['ret']['title']
                    else:
                        raise RuntimeError(
                            "Don't know what the return type of `getHeadTitlePhantomJS` is! Type: %s"
                            % type(new['ret']))

                else:
                    raise RuntimeError("Response to unknown call: %s!" % new)

                self.log.info("Processing remote head response: %s", new)
                self.log.info("Resolved job to URL: %s", respurl)
                self.log.info("Page title: %s", title)

                # Handle the 301/2 not resolving properly.
                netloc = urllib.parse.urlsplit(respurl).netloc
                if "novelupdates" in netloc:
                    self.log.warning(
                        "Failed to validate external URL. Either scraper is blocked, or phantomjs is failing."
                    )
                    return True


                have = self.db_sess.query(db.NuReleaseItem)                                    \
                 .options(joinedload('resolved'))                                           \
                 .filter(db.NuReleaseItem.outbound_wrapper==new['extradat']['wrapper_url']) \
                 .filter(db.NuReleaseItem.referrer==new['extradat']['referrer'])            \
                 .scalar()
                if not have:
                    self.log.error("Base row deleted from resolve?")
                    return

                # NOTE: `new` is rebound here from the RPC response dict to the
                # ORM row; the generic handler below pprints whichever object
                # `new` is at failure time.
                new = db.NuResolvedOutbound(
                    client_id=new['user'],
                    client_key=new['user_uuid'],
                    actual_target=respurl,
                    resolved_title=title,
                    fetched_on=datetime.datetime.now(),
                )

                have.resolved.append(new)
                self.db_sess.commit()
                return True
            except sqlalchemy.exc.InvalidRequestError:
                self.db_sess.rollback()
            except sqlalchemy.exc.OperationalError:
                self.db_sess.rollback()
            except sqlalchemy.exc.IntegrityError:
                self.db_sess.rollback()

            except Exception:
                self.log.error("Error when processing job response!")
                for line in traceback.format_exc().split("\n"):
                    self.log.error(line)

                self.log.error("Contents of head response:")

                for line in pprint.pformat(new).split("\n"):
                    self.log.error(line)
                return True
        # NOTE(review): unreachable — every path in the loop above either
        # returns or continues the loop, and there is no `break`.
        return False