def extractSeriesReleases(self, row):

        tds = row.find_all("td")
        if len(tds) != 4:
            self.log.warning(
                "Row does not have four <td> tags! Don't know how to handle")
            pdtag = row.prettify()
            for line in pdtag.split("\n"):
                self.log.warning(line)

            return None
        title_td, ch_td, trans_td, release_td = tds

        title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)

        author = trans_td.get_text(strip=True)

        if not title:
            return None
        if not author:
            return None

        # Cripes this is probably brittle
        series_type = "translated" if "," in author else "oel"
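        # Illustrative assumption: translated releases credit multiple parties
        # comma-separated (e.g. "GroupX, SomeEditor"), while OEL works list a
        # single author, so a bare "JaneDoe" falls through to "oel".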

        reldate = float(release_td.span['data-timestamp'])

        chp_title = ch_td.get_text(strip=True)

        vol, chp, frag, _ = extractTitle(chp_title)
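        # Hypothetical illustration (actual parsing is up to extractTitle):
        # "Vol. 2 Chapter 5.1 - Rescue" would be expected to yield roughly
        # vol=2, chp=5, frag=1, with the remainder as the postfix.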

        raw_item = {}
        raw_item['srcname'] = 'FoxTeller'
        raw_item['published'] = reldate
        raw_item['linkUrl'] = urllib.parse.urljoin("https://www.foxteller.com",
                                                   ch_td.a['href'])

        raw_msg = msgpackers._buildReleaseMessage(
            raw_item=raw_item,
            series=title,
            vol=vol,
            chap=chp,
            frag=frag,
            # author      = author,
            postfix=chp_title,
            tl_type=series_type,
            # matchAuthor = True,
            # looseMatch  = True
        )

        msg = msgpackers.createReleasePacket(raw_msg)

        return msg
	def dispatchBT(self, itemurl, itemtxt):
		titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]
		probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()

		vol, chp, frag, post = extractTitle(titleonly)

		raw_item = {}
		raw_item['srcname']   = "Baka-Tsuki"
		raw_item['published'] = time.time()
		raw_item['linkUrl']   = itemurl

		self.put_page_link(itemurl)

		msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
		msg = msgpackers.createReleasePacket(msg)
		return msg  # presumably intended, matching dispatchNanoDesu below
	def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
		itemtitle = NANO_DESU_MAP[netloc]
		vol, chp, frag, post = extractTitle(itemtxt)
		if not (vol or chp):
			return None

		raw_item = {}
		raw_item['srcname']   = "Nano Desu"
		raw_item['published'] = time.time()
		raw_item['linkUrl']   = itemurl

		self.put_page_link(itemurl)

		msg = msgpackers.buildReleaseMessage(raw_item, itemtitle, vol, chp, frag, postfix=post)
		msg = msgpackers.createReleasePacket(msg)
		return msg
	def extractSeriesReleases(self, seriesPageUrl, soup):

		match = self.match_re.search(seriesPageUrl)
		series_id = match.group(1)
		conf = load_lut()

		assert 'force_sequential_numbering' in conf

		must_renumber = series_id in conf['force_sequential_numbering']


		# print("")
		# print("Match: ", match, match.groups(), series_id)
		# print("series_id", series_id)
		# print("Renumber:", must_renumber)


		header   = soup.find("div", class_='fic-title')
		titletg  = header.find("h2")
		authortg = header.find("h4")
		authortg.find("span").decompose()

		ratingtg_type_1 = soup.find("div", class_='rating')
		ratingtg_type_2 = soup.find("li", text=re.compile('Overall Score'))


		if ratingtg_type_1:
			startg = ratingtg_type_1.find("span", class_='star')
		elif ratingtg_type_2:
			# print(ratingtg_type_2)
			starcontainer = ratingtg_type_2.find_next_sibling("li")
			if not starcontainer:
				self.log.error("Could not find rating tag (starcontainer)!")
				return []
			startg = starcontainer.find("span", class_='star')
			if not startg:
				self.log.error("Could not find rating tag (startg)!")
				return []
		else:
			self.log.error("Could not find rating tag!")
			return []

		ratingcls = [tmp for tmp in startg['class'] if re.match(r"star\-\d+", tmp)]
		if not ratingcls:
			# Guard before indexing: no "star-NN" class means no usable rating.
			return []

		rating = ratingcls[0].split("-")[-1]

		rating = float(rating) / 10
		rating = rating * 2  # Convert the 0-5 star value to a 0-10 scale
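		# Worked example: class "star-45" -> "45" -> 45 / 10 = 4.5 -> * 2 = 9.0.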

		if rating < MIN_RATING and rating != 0.0:
			self.log.error("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.error("Could not find title tag!")
			return []
		if not authortg:
			self.log.error("Could not find author tag!")
			return []


		title  = titletg.get_text().strip()
		author = authortg.get_text().strip()



		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		if not descDiv or not descDiv.div:
			self.log.error("Incomplete or broken description?")
			return []

		desc = []
		for segment in descDiv.div:
			if isinstance(segment, bs4.NavigableString):
				desc.append(str(segment).strip())
			else:
				if segment.get_text().strip():
					desc.append(segment.get_text().strip())

		desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
		# print(desc)

		tags = []
		tagdiv = soup.find('div', class_='tags')
		for tag in tagdiv.find_all('span', class_='label'):
			tagtxt = tag.get_text().strip().lower().replace(" ", "-")
			# print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
			if tagtxt in conf['tag_rename']:
				tagtxt = conf['tag_rename'][tagtxt]
			tags.append(tagtxt)

		info_div = soup.find("div", class_='fiction-info')
		warning_div = info_div.find("div", class_='font-red-sunglo')
		if warning_div:
			for warning_tag in warning_div.find_all('li'):
				tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
				# print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
				if tagtxt in conf['tag_rename']:
					tagtxt = conf['tag_rename'][tagtxt]
				tags.append(tagtxt)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = "\r\n".join(desc)
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'
		seriesmeta['create_tags'] = True


		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)
		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find_all("tr", attrs={"data-url" : True})

		raw_retval = []
		for chapter in chapters:
			if len(chapter.find_all("td")) != 2:
				self.log.warning("Row with invalid number of entries?")
				continue
			cname, cdate = chapter.find_all("td")

			reldate = cdate.time['unixtime']
			relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)


			chp_title = cname.get_text().strip()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title + " " + title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = float(reldate)
			raw_item['linkUrl']   = relurl

			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)

			raw_retval.append(raw_msg)

		missing_chap = 0
		for item in raw_retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(raw_retval):
			unnumbered = (missing_chap/len(raw_retval)) * 100
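			# e.g. 9 unnumbered items out of 10 -> 90 > 80, so renumbering kicks in.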
			if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
				if must_renumber:
					self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
				else:
					self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in raw_retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series with fewer than three chapters.
		if len(raw_retval) < 3:
			self.log.info("Less than three chapters!")
			return []

		if not raw_retval:
			self.log.info("Retval empty?!")
			return []

		self.amqp_put_item(meta_pkt)
		retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
		return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h4", class_='seriestitle')
		altnametg  = soup.find("div", id='editassociated')
		descrtg  = soup.find("div", id='editdescription')



		link_sets = {
			'authortg'        : soup.find("div", id='showauthors'),
			'artisttg'        : soup.find("div", id='showartists'),
			'langtg'          : soup.find("div", id='showlang'),
			'genretg'         : soup.find("div", id='seriesgenre'),
			'tagstg'          : soup.find("div", id='showtags'),
			'typetg'          : soup.find("div", id='showtype'),
			'orig_pub_tg'     : soup.find("div", id='showopublisher'),
			'eng_pub_tg'      : soup.find("div", id='showepublisher'),
		}

		text_sets = {
			'transcompletetg' : soup.find("div", id='showtranslated'),
			'yeartg'          : soup.find("div", id='edityear'),
			'coostatustg'     : soup.find("div", id='editstatus'),
			'licensedtg'      : soup.find("div", id='showlicensed'),
			}

		if not titletg:
			self.log.warning("Could not find item title!")
			return []
		if not altnametg:
			self.log.warning("Could not find alt-name container tag!")
			return []
		if not descrtg:
			self.log.warning("Could not find description container tag!")
			return []

		data_sets = {}
		for key in list(link_sets.keys()):
			if not link_sets[key]:
				self.log.warning("Could not find tag for name: '%s'", key)
				return []
			data_sets[key] = [tag.get_text() for tag in link_sets[key].find_all("a")]

		for key in list(text_sets.keys()):
			if not text_sets[key]:
				self.log.warning("Could not find tag for name: '%s'", key)
				return []
			data_sets[key] = [tmp.strip() for tmp in text_sets[key].contents if isinstance(tmp, bs4.NavigableString)]

		title  = titletg.get_text().strip()

		data_sets['title'] = title
		data_sets['altnames'] = [tmp.strip() for tmp in altnametg.contents if isinstance(tmp, bs4.NavigableString)]

		# Scrub incoming markup
		for key in list(data_sets.keys()):
			if isinstance(data_sets[key], list):
				data_sets[key] = [bleach.clean(val, tags=[], attributes=[], styles=[], strip=True, strip_comments=True).strip() for val in data_sets[key]]
			else:
				data_sets[key] = bleach.clean(data_sets[key], tags=[], attributes=[], styles=[], strip=True, strip_comments=True).strip()


		if data_sets['yeartg'] and data_sets['yeartg'][0]:
			print("Non-null data_sets['yeartg']:", data_sets['yeartg'])
			tmp_d = datetime.datetime(year=int(data_sets['yeartg'].pop()), month=1, day=1)
			data_sets['yeartg'] = calendar.timegm(tmp_d.timetuple())
		else:
			data_sets['yeartg'] = None

		# Example of the data_sets structure at this point:
		# {
			# 'coostatustg': ['3 Volumes (Ongoing)', '5 Web Volumes (Ongoing)'],
			# 'orig_pub_tg': ['Media Factory'],
			# 'eng_pub_tg': [],
			# 'typetg': ['Web Novel'],
			# 'genretg': ['Action', 'Adventure', 'Comedy', 'Ecchi', 'Fantasy', 'Romance', 'Seinen'],
			# 'licensedtg': ['No'],
			# 'altnames': ['Sendai Yuusha wa Inkyoshitai', 'The Previous Hero wants to Retire', '先代勇者は隠居したい'],
			# 'authortg': ['Iida K'],
			# 'artisttg': ['Shimotsuki Eito'],
			# 'title': 'Sendai Yuusha wa Inkyou Shitai',
			# 'description': '<p>\n  Three years ago, in the land of Reinbulk, a Legendary Hero was summoned in the Kindom of Leezalion and he succeeded in repelling the Demon King. Now, five students are summoned back into Reinbulk by the Kingdom of Luxeria to fight against the Demon King and the demon army. Unlike the other heroes, Yashiro Yuu has no magical affinity and the Luxeria Kingdom has no intention on acknowledging his existence or returning him to his world.\n </p>\n <p>\n  However, Yuu is actually the previous Hero that had fought the Demon King. Moreover, he is perplexed at the situation since he knows the Demon King has not returned since he sealed him. If the seal was ever broken then he would be automatically summoned instead of normal summoned. Since he already saved the world once and the Demon King hasn’t been unsealed, Yuu decides to leave the demons to the new heroes and retire from the Hero business. So he decides to become an adventurer.\n </p>',
			# 'tagstg': ['Elves', 'Heroes', 'Magic', 'Monsters', 'Multiple Narrators', 'Protagonist Strong from the Start', 'Strong Male Lead', 'Sword and Sorcery', 'Transported to Another World'],
			# 'langtg': ['Japanese'],
			# 'yeartg': ['2013']


		# 	'transcompletetg': ['No'],
		# }

		data_sets['description'] = bleach.clean(descrtg.prettify(), tags=['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol', 'strong', 'ul', 'p'], strip=True).strip()

		series_message = {
			'update_only'   : False,
			'sourcesite'    : "NovelUpdates",
			'title'         : data_sets['title'],
			'alt_titles'    : data_sets['altnames'] + [data_sets['title'], ],

			'desc'          : data_sets['description'],
			# 'homepage'      : data_sets[''],
			'author'        : data_sets['authortg'],
			'illust'        : data_sets['artisttg'],

			'pubdate'       : data_sets['yeartg'],
			'pubnames'      : data_sets['orig_pub_tg'] + data_sets['eng_pub_tg'],
			# 'sourcesite'    : data_sets[''],
			'tags'          : data_sets['tagstg'],

			# AFAICT, NovelUpdates doesn't have any English items, but handle it anyway.
			'tl_type'       : "translated" if 'English' not in data_sets['langtg'] else "oel",

			# New:
			'coostate'      : data_sets['coostatustg'],
			'type'          : data_sets['typetg'],
			'genres'        : data_sets['genretg'],
			'licensed'      : data_sets['licensedtg'],
			'transcomplete' : data_sets['transcompletetg'],

		}

		pkt = msgpackers.createSeriesInfoPacket(series_message, matchAuthor=True, beta=self.is_beta)
		# print(pkt)

		extra = {}
		extra['tags']     = data_sets['tagstg']
		# extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'Unknown'


		chapter_tbl = soup.find("table", class_='tablesorter')
		releases = chapter_tbl.find_all("tr")

		retval = []
		for release in releases:

			items = release.find_all("td")
			if len(items) == 0:
				continue

			date_tg, group_tg, chp_tg = items

			rel = datetime.datetime.strptime(date_tg.get_text(), '%m/%d/%y')
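			# e.g. "03/21/16" parses as 2016-03-21 ('%m/%d/%y' is month-first).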
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title  = chp_tg.get_text().strip()
			group_name = group_tg.get_text().strip()
			vol, chp, frag, post = extractTitle(chp_title)

			raw_item = {}
			raw_item['srcname']   = msgpackers.fixSmartQuotes(group_name)
			raw_item['published'] = reldate
			raw_item['linkUrl']   = chp_tg.a['href']

			msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=data_sets['authortg'], postfix=chp_title, tl_type='translated', extraData=extra, matchAuthor=True)
			retval.append(msg)

		missing_chap = 0
		for item in retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(retval):
			unnumbered = (missing_chap/len(retval)) * 100
			if len(retval) >= 5 and unnumbered > 80:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# # Do not add series with fewer than 3 chapters.
		# if len(retval) < 3:
		# 	self.log.info("Less than three chapters!")
		# 	return []

		if not retval:
			self.log.info("Retval empty?!")
			return []
		self.amqp_put_item(pkt)
		# return []
		return retval
    def extractSeriesReleases(self, seriesPageUrl, soup):
        title = soup.find("div", class_='fanfic_title_div').get_text()
        author = soup.find("div", class_='fanfic_author_div').get_text()
        ratingtg = soup.find("div", class_='fanfic_title_wrapper')
        ratingtg = [
            item for item in ratingtg.contents if "Rating" in str(item)
        ]
        if not ratingtg:
            # No rating line at all; the "·"-split below would fail on an empty string.
            return []
        ratingtg = ratingtg.pop()

        rating, views, chapters = ratingtg.split("·")
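        # Assumed format, for illustration: "Rating: 4.5 · 123 Views · 27 Chapters"
        # splits into the rating, view-count, and chapter-count segments.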

        # I think the japtem rating system is just plain broken.
        if "no rating" not in ratingtg.lower():
            rating_score = float(rating.split()[-1])
            if rating_score < MIN_RATING:
                return []

        chapter_num = float(chapters.split()[0])
        if chapter_num < 3:
            return []

        if not title:
            return []
        if not author:
            return []

        descDiv = soup.find('div', class_='fanfic_synopsis')

        if not descDiv:
            # Bail out rather than crash on descDiv.find_all() below.
            return []

        paras = descDiv.find_all("p")
        tags = []

        desc = []
        for para, text in [(para, para.get_text()) for para in paras]:
            if text.lower().startswith('categories:'):
                tagstr = text.split(":", 1)[-1]
                items = tagstr.split(",")
                tags.extend(item.strip() for item in items if item.strip())
            else:
                desc.append(para)

        seriesmeta = {}

        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = ''
        seriesmeta['desc'] = " ".join([str(para) for para in desc])
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'JapTem Fanfic'

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = ''
        extra['sourcesite'] = 'JapTem Fanfic'

        retval = []

        chapters = soup.find("ul", class_='fanfic_chapter_list')
        volumes = chapters.find_all('li', class_='fanfic_volume')
        for volume in volumes:
            releases = volume.find_all('li', class_='fanfic_chapter')
            for release in releases:
                chp_title = release.find("a")

                vol_str = volume.find('div',
                                      class_='fanfic_volume_title').get_text()
                reldate = time.time()

                chp_title = chp_title.get_text()

                agg_title = " ".join((vol_str, chp_title))
                vol, chp, frag, post = extractTitle(agg_title)

                raw_item = {}
                raw_item['srcname'] = 'JapTem Fanfic'
                raw_item['published'] = reldate
                releaseurl = urllib.parse.urljoin(seriesPageUrl,
                                                  release.a['href'])
                raw_item['linkUrl'] = releaseurl

                raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                         title,
                                                         vol,
                                                         chp,
                                                         frag,
                                                         author=author,
                                                         postfix=chp_title,
                                                         tl_type='oel',
                                                         extraData=extra,
                                                         matchAuthor=True)
                msg = msgpackers.createReleasePacket(raw_msg)

                retval.append(msg)

        if not retval:
            return []

        self.amqp_put_item(meta_pkt)
        return retval
    def extractSeriesReleases(self, seriesPageUrl, soup):

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)
        conf = load_lut()

        assert 'force_sequential_numbering' in conf

        must_renumber = series_id in conf['force_sequential_numbering']

        # print("")
        # print("Match: ", match, match.groups(), series_id)
        # print("series_id", series_id)
        # print("Renumber:", must_renumber)

        header = soup.find("div", class_='fic-title')
        titletg = header.find("h1")
        authortg = header.find("h4")
        authortg.find("span").decompose()

        rating_val = soup.find("meta", property='books:rating:value')
        rating_scale = soup.find("meta", property='books:rating:scale')

        print("Rating value:", rating_val)
        print("Rating scale:", rating_scale)

        if not rating_val or not rating_scale:
            return []

        rval_f = float(rating_val.get('content', "0"))
        rscale_f = float(rating_scale.get('content', "999999"))

        rating = 5 * (rval_f / rscale_f)
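        # e.g. a rating of "4.5" on a "5" scale gives 5 * (4.5 / 5) = 4.5.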

        print("Float rating: ", rating)

        if rating < MIN_RATING and rating != 0.0:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if not titletg:
            self.log.error("Could not find title tag!")
            return []
        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='description')
        if not descDiv or not descDiv.div:
            self.log.error("Incomplete or broken description?")
            return []

        desc = []
        for segment in descDiv.div:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
        # print(desc)

        tags = []
        tagdiv = soup.find('span', class_='tags')
        for tag in tagdiv.find_all('span', class_='label'):
            tagtxt = tag.get_text().strip().lower().replace(" ", "-")
            # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
            if tagtxt in conf['tag_rename']:
                tagtxt = conf['tag_rename'][tagtxt]
            tags.append(tagtxt)

        info_div = soup.find("div", class_='fiction-info')
        warning_div = info_div.find("div", class_='font-red-sunglo')
        if warning_div:
            for warning_tag in warning_div.find_all('li'):
                tagtxt = warning_tag.get_text().strip().lower().replace(
                    " ", "-")
                # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
                if tagtxt in conf['tag_rename']:
                    tagtxt = conf['tag_rename'][tagtxt]
                tags.append(tagtxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        chapters = soup.find_all("tr", attrs={"data-url": True})

        raw_retval = []
        for chapter in chapters:
            if len(chapter.find_all("td")) != 2:
                self.log.warning("Row with invalid number of entries?")
                continue
            cname, cdate = chapter.find_all("td")

            reldate = cdate.time['unixtime']
            relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, post = extractTitle(chp_title + " " + title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                     title,
                                                     vol,
                                                     chp,
                                                     frag,
                                                     author=author,
                                                     postfix=chp_title,
                                                     tl_type='oel',
                                                     extraData=extra,
                                                     matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        missing_chap = 0
        for item in raw_retval:
            if not (item['vol'] or item['chp']):
                missing_chap += 1

        if len(raw_retval):
            unnumbered = (missing_chap / len(raw_retval)) * 100
            if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
                if must_renumber:
                    self.log.warning(
                        "Item numbering force-overridden! Adding simple sequential chapter numbers."
                    )
                else:
                    self.log.warning(
                        "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                    )
                chap = 1
                for item in raw_retval:
                    item['vol'] = None
                    item['chp'] = chap
                    chap += 1

        # Do not add series with fewer than three chapters.
        if len(raw_retval) < 3:
            self.log.info("Less than three chapters!")
            return []

        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        # self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):
		title  = soup.find("div", class_='fanfic_title_div').get_text()
		author = soup.find("div", class_='fanfic_author_div').get_text()
		ratingtg = soup.find("div", class_='fanfic_title_wrapper')
		ratingtg = [item for item in ratingtg.contents if "Rating" in str(item)]
		if not ratingtg:
			# No rating line at all; the "·"-split below would fail on an empty string.
			return []
		ratingtg = ratingtg.pop()


		rating, views, chapters = ratingtg.split("·")

		# I think the japtem rating system is just plain broken.
		if "no rating" not in ratingtg.lower():
			rating_score = float(rating.split()[-1])
			if rating_score < MIN_RATING:
				return []


		chapter_num = float(chapters.split()[0])
		if chapter_num < 3:
			return []



		if not title:
			return []
		if not author:
			return []


		descDiv = soup.find('div', class_='fanfic_synopsis')

		if not descDiv:
			# Bail out rather than crash on descDiv.find_all() below.
			return []

		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = ''
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'JapTem'


		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = ''
		extra['sourcesite']  = 'JapTem'

		retval = []

		chapters = soup.find("ul", class_='fanfic_chapter_list')
		volumes = chapters.find_all('li', class_='fanfic_volume')
		for volume in volumes:
			releases = volume.find_all('li', class_='fanfic_chapter')
			for release in releases:
				chp_title = release.find("a")

				vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
				reldate = time.time()

				chp_title = chp_title.get_text()

				agg_title = " ".join((vol_str, chp_title))
				# print("Chp title: '{}'".format(chp_title))
				vol, chp, frag, post = extractTitle(agg_title)
				raw_item = {}
				raw_item['srcname']   = "JapTem"
				raw_item['published'] = reldate
				releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
				raw_item['linkUrl']   = releaseurl

				msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra)
				msg = msgpackers.createReleasePacket(msg)

				retval.append(msg)
		if not retval:
			return []

		retval.append(meta_pkt)
		# return []
		return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			self.log.info("Could not find rating tag!")
			return []


		rating = float(ratingtg['score'])
		if rating < MIN_RATING and rating != 0.0:
			self.log.info("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.info("Could not find title tag!")
			return []
		if not authortg:
			self.log.info("Could not find author tag!")
			return []


		title  = titletg.get_text()
		author = authortg.get_text()
		assert author.startswith("by ")
		author = author[2:].strip()


		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find("div", class_='chapters')
		releases = chapters.find_all('li', class_='chapter')

		raw_retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
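			# e.g. "21/03/16" parses as 2016-03-21 ('%d/%m/%y' is day-first).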
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title + " " + title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)

			raw_retval.append(raw_msg)

		missing_chap = 0
		for item in raw_retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(raw_retval):
			unnumbered = (missing_chap/len(raw_retval)) * 100
			if len(raw_retval) >= 5 and unnumbered > 80:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in raw_retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series with fewer than three chapters.
		if len(raw_retval) < 3:
			self.log.info("Less than three chapters!")
			return []

		if not raw_retval:
			self.log.info("Retval empty?!")
			return []

		self.amqp_put_item(meta_pkt)
		retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
		return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			return []

		if float(ratingtg['score']) < MIN_RATING:
			return []

		if not titletg:
			return []
		if not authortg:
			return []

		title  = titletg.get_text()
		author = authortg.get_text()
		assert author.startswith("by ")
		author = author[2:].strip()


		descDiv = soup.find('div', class_='description')
		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		pkt = msgpackers.sendSeriesInfoPacket(seriesmeta)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find("div", class_='chapters')
		releases = chapters.find_all('li', class_='chapter')

		retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra)
			retval.append(msg)

		# Do not add series with fewer than 3 chapters.
		if len(retval) < 3:
			return []

		if not retval:
			return []
		self.amqp_put_item(pkt)
		return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			self.log.info("Could not find rating tag!")
			return []


		rating = float(ratingtg['score'])
		if rating < MIN_RATING and rating != 0.0:
			self.log.info("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.info("Could not find title tag!")
			return []
		if not authortg:
			self.log.info("Could not find author tag!")
			return []


		title  = titletg.get_text()
		author = authortg.get_text()
		assert author.startswith("by ")
		author = author[2:].strip()


		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find("div", class_='chapters')
		releases = chapters.find_all('li', class_='chapter')

		retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
			retval.append(msg)

		missing_chap = 0
		for item in retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(retval):
			unnumbered = (missing_chap/len(retval)) * 100
			if len(retval) >= 5 and unnumbered > 80:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series with fewer than three chapters.
		if len(retval) < 3:
			self.log.info("Less than three chapters!")
			return []



		if not retval:
			self.log.info("Retval empty?!")
			return []
		self.amqp_put_item(pkt)
		return retval