def processFeedData(self, feedDat, tx_raw=True, tx_parse=True):
	"""
	Dispatch a single RSS feed item onto the AMQP queue.

	Skips items whose link URL contains a filtered substring, and items
	whose netloc is not approved for transmission by the WebMirror rules.
	Optionally emits the raw feed message (`tx_raw`) and/or the parsed
	release info (`tx_parse`).
	"""
	link_url = feedDat['linkUrl']

	# Drop anything whose URL matches the skip filter.
	if any(filtered in link_url for filtered in skip_filter):
		print("LinkURL '%s' contains a filtered string. Not fetching!" % link_url)
		return

	netloc = urllib.parse.urlparse(link_url).netloc

	# Resolve a human-readable source name, falling back to the raw netloc.
	feedDat['srcname'] = feedNameLut.getNiceName(link_url) or netloc

	# print("ProcessFeedData! ", netloc)
	if not WebMirror.rules.netloc_send_feed(netloc):
		print("Not sending data for netloc: ", netloc)
		return

	if tx_raw:
		raw_msg = self.getRawFeedMessage(feedDat)
		if raw_msg:
			self.amqp_put_item(raw_msg)

	if tx_parse:
		parsed_msg = self.getProcessedReleaseInfo(feedDat)
		if parsed_msg:
			self.amqp_put_item(parsed_msg)
# ---- Example #2 (scrape-artifact separator; original read "Exemple #2" / "0") ----
	def processFeedData(self, session, feedDat, tx_raw=True, tx_parse=True):
		"""
		Process a single RSS feed item, optionally emitting the parsed
		release info and the raw feed message onto the AMQP queue.

		Parameters:
			session  - DB session, used for nice-name lookups.
			feedDat  - dict describing the feed item ('linkUrl', 'title', ...).
			tx_raw   - if True, transmit the raw feed message.
			tx_parse - if True, transmit the parsed release info.
		"""

		# URL-based skip filter.
		if any([item in feedDat['linkUrl'] for item in common.global_constants.RSS_SKIP_FILTER]):
			print("LinkURL '%s' contains a filtered string. Not fetching!" % feedDat['linkUrl'])
			return

		# Title-based skip filter. Fixed message: the original printed the
		# copy-pasted "LinkURL ... contains a filtered string" text here,
		# even though this branch matches on the *title* prefix.
		if any([feedDat['title'].lower().startswith(item) for item in common.global_constants.RSS_TITLE_FILTER]):
			print("Title '%s' starts with a filtered string. Not fetching!" % feedDat['title'])
			return

		# print("Feed item title: ", feedDat['title'], feedDat)

		# "By: " titles are comment/author feed entries, not releases.
		if feedDat['title'].lower().startswith("by: "):
			self.log.warning("Skipping due to title: '%s'", feedDat['title'])
			return

		netloc = urllib.parse.urlparse(feedDat['linkUrl']).netloc

		# Resolve a human-readable source name, falling back to the raw netloc.
		nicename = feedNameLut.getNiceName(session, feedDat['linkUrl'])
		if not nicename:
			nicename = netloc
		feedDat['srcname'] = nicename

		if should_ignore_feed_post(feedDat):
			self.log.warning("Skipping due to should_ignore_feed_post")
			return

		# print("ProcessFeedData! ", netloc)

		# A bunch of crap is aggregated through the "feedproxy.google.com" netloc.
		if "feedproxy.google.com" in netloc:
			print("Not sending data for feedproxy netloc: ", netloc)
			return

		try:
			new = self.getProcessedReleaseInfo(feedDat)
		except AssertionError:
			self.log.error("Exception when processing release!")
			for line in traceback.format_exc().split("\n"):
				self.log.error(line.rstrip())
			return

		if tx_parse and new:
			self.log.info("Sending parsed release!")
			self.amqp_put_item(new)

		# Raw feed transmission is additionally gated on the per-netloc rules.
		if not WebMirror.rules.netloc_send_feed(netloc):
			print("Not sending raw feed for netloc due to rules: ", netloc)
			return

		# Only build the raw message when it will actually be sent
		# (the original computed it unconditionally, even with tx_raw=False).
		if tx_raw:
			raw = self.getRawFeedMessage(feedDat)
			if raw:
				self.amqp_put_item(raw)
# ---- Example #3 (scrape-artifact separator; original read "Exemple #3" / "0") ----
def missing_lut():
	"""Print the netloc of every configured feed URL lacking a nice-name entry."""
	import WebMirror.OutputFilters.util.feedNameLut as fnl
	# Flatten the per-rule feed-URL lists into a single sequence.
	feed_urls = [url for rule in WebMirror.rules.load_rules() for url in rule['feedurls']]
	# feed_urls = [urllib.parse.urlsplit(tmp).netloc for tmp in feed_urls]
	for url in feed_urls:
		if not fnl.getNiceName(url):
			print("Missing: ", urllib.parse.urlsplit(url).netloc)
# ---- Example #4 (scrape-artifact separator; original read "Exemple #4" / "0") ----
def missing_lut():
	"""
	Print the netloc of every configured feed URL lacking a nice-name entry.

	Removed the dead trailing `pass` statement and flattened the two-step
	list-of-lists handling into a single comprehension.
	"""
	import WebMirror.OutputFilters.util.feedNameLut as fnl
	rules = WebMirror.rules.load_rules()
	feeds = [url for rule in rules for url in rule['feedurls']]
	# feeds = [urllib.parse.urlsplit(tmp).netloc for tmp in feeds]
	for feed in feeds:
		if not fnl.getNiceName(feed):
			print("Missing: ", urllib.parse.urlsplit(feed).netloc)
def exposed_nu_new():
	'''
	Parse outbound netlocs from NovelUpdates releases, extracting
	any sites that are not known in the feednamelut.
	'''

	import WebMirror.OutputFilters.util.feedNameLut as fnl
	session = db.get_db_session()

	rows = session.query(db.NuOutboundWrapperMap)              \
		.filter(db.NuOutboundWrapperMap.validated == True)     \
		.filter(db.NuOutboundWrapperMap.actual_target != None) \
		.all()

	all_netlocs = [urllib.parse.urlsplit(row.actual_target).netloc for row in rows]
	print("Nu outbound items: ", len(all_netlocs))

	unique_netlocs = set(all_netlocs)
	for loc in unique_netlocs:
		if fnl.getNiceName(None, loc):
			continue
		# Re-run the lookup with debug output to show why it missed.
		fnl.getNiceName(None, loc, debug=True)
		print("Missing: ", loc)

	print("Nu outbound items: ", len(unique_netlocs))
def exposed_missing_lut():
	'''
	Iterate over distinct RSS feed sources in database,
	and print any for which there is not an entry in
	feedDataLut.py to the console.
	'''
	import WebMirror.OutputFilters.util.feedNameLut as fnl
	# Flatten every rule's feed-URL list into one sequence.
	all_feeds = [url for rule in WebMirror.rules.load_rules() for url in rule['feedurls']]
	# all_feeds = [urllib.parse.urlsplit(tmp).netloc for tmp in all_feeds]
	for feed_url in all_feeds:
		if not fnl.getNiceName(feed_url):
			print("Missing: ", urllib.parse.urlsplit(feed_url).netloc)
# ---- Example #7 (scrape-artifact separator; original read "Exemple #7" / "0") ----
	def _group_dotted(self, db_sess, pairs):
		"""
		Group (name, url) pairs into {source_nice_name: {name: url}}.

		The source name comes from feedNameLut; when no nice name is
		known, the raw netloc of the URL is used instead.
		"""
		grouped = {}
		for name, url in pairs:
			netloc = urllib.parse.urlparse(url).netloc
			nicename = feedNameLut.getNiceName(db_sess, url)
			if not nicename:
				nicename = netloc
			grouped.setdefault(nicename, {})
			grouped[nicename][name] = url
		return grouped

	def get_dotted(self):
		"""
		Collect validated NU release rows whose series name or group info
		was truncated with "...", grouped by source-site nice-name.

		Writes the result to 'dotted_nu_items.pyson' (pprint format) and
		returns the (dotted_series_map, dotted_authors_map) tuple.
		"""
		self.fix_names()

		dotted_series = []
		dotted_authors = []

		with db.session_context() as db_sess:
			print("Counting items to load.")
			# Count is only informational (used by the commented-out tqdm path).
			count = db_sess.query(db.NuReleaseItem)      \
				.filter(db.NuReleaseItem.reviewed == 'valid')        \
				.filter(db.NuReleaseItem.validated == True)       \
				.count()

			print("Loading")
			validated = db_sess.query(db.NuReleaseItem)      \
				.filter(db.NuReleaseItem.reviewed == 'valid')        \
				.filter(db.NuReleaseItem.validated == True)       \
				.yield_per(1000)
			# Materialize inside the session context, before it closes.
			validated = list(validated)
			# validated = [tmp for tmp in tqdm.tqdm(validated, total=count)]

		print("Found %s releases" % len(validated))

		# for row in tqdm.tqdm(validated):
		for row in validated:
			if row.seriesname.endswith("..."):
				dotted_series.append((row.seriesname.strip(), row.actual_target))
			if row.groupinfo.endswith("..."):
				dotted_authors.append((row.groupinfo, row.actual_target))

		# The two grouping passes were duplicated inline; factored into
		# a single helper.
		with db.session_context() as db_sess:
			dseries = self._group_dotted(db_sess, dotted_series)
			dauths  = self._group_dotted(db_sess, dotted_authors)

		self.log.info("Found %s dotted series, %s dotted authors", len(dseries), len(dauths))

		with open("dotted_nu_items.pyson", "w") as fp:
			fp.write(pprint.pformat((dseries, dauths), indent=4))

		return (dseries, dauths)