Example #1
def extractSeriesReleases(self, seriesPageUrl, soup):
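		# Each chapter link carries its release metadata in data-preprocessor-*
		# attributes, which appear to be injected by an upstream preprocessing pass.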
		chapter_divs = soup.find_all("a", class_='chapter-link')
		retval = []

		for linka in chapter_divs:
			state   = linka['data-preprocessor-state']
			vol     = linka['data-preprocessor-vol']
			chp     = linka['data-preprocessor-chp']
			name    = linka['data-preprocessor-name']
			index   = linka['data-preprocessor-index']
			title   = linka['data-preprocessor-title']
			reldate = linka['data-preprocessor-reldate']
			href    = linka['href']



			itemDate, status = parsedatetime.Calendar().parse(reldate)
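			# parsedatetime returns a (struct_time, status) tuple; status 0 means
			# the string could not be parsed as a date at all.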

			if status < 1:
				continue

			reldate = time.mktime(itemDate)

			relurl = common.util.urlFuncs.rebaseUrl(href + "/", seriesPageUrl)


			print([vol, chp, state, linka])

			raw_item = {}
			raw_item['srcname']   = "Qidian"
			raw_item['published'] = float(reldate)
			raw_item['linkUrl']   = relurl

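			# State '0' appears to mark a live chapter, while '2' marks a deleted one.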
			if state == '0':
				raw_msg = msgpackers.buildReleaseMessageWithType(raw_item, title, None, index, None, tl_type='translated', prefixMatch=True)
				retval.append(msgpackers.serialize_message(raw_msg))
			elif state == "2":
				raw_msg = msgpackers.buildReleaseDeleteMessageWithType(raw_item, title, None, index, None, tl_type='translated', prefixMatch=True)
				retval.append(msgpackers.serialize_message(raw_msg))
			else:
				print("Unknown state:", state)

		# Do not add series without 3 chapters.
		if len(retval) < 3:
			self.log.info("Less then three chapters!")
			return []

		# if not retval:
		# 	self.log.info("Retval empty?!")
		# 	return []

		# return []

		return retval
Example #2
    def extractSeriesReleases(self, row):

        tds = row.find_all("td")
        if len(tds) != 4:
            self.log.warning(
                "Row does not have four <td> tags! Don't know how to handle")
            pdtag = row.prettify()
            for line in pdtag.split("\n"):
                self.log.warning(line)

            return None
        title_td, ch_td, trans_td, release_td = tds

        title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)

        author = trans_td.get_text(strip=True)

        if not title:
            return None
        if not author:
            return None

        # Cripes this is probably brittle
        series_type = "translated" if "," in author else "oel"

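        # The data-timestamp attribute appears to hold the release time as epoch seconds.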
        reldate = float(release_td.span['data-timestamp'])

        chp_title = ch_td.get_text(strip=True)

        vol, chp, frag, _ = extractTitle(chp_title)

        raw_item = {}
        raw_item['srcname'] = 'FoxTeller'
        raw_item['published'] = reldate
        raw_item['linkUrl'] = urllib.parse.urljoin("https://www.foxteller.com",
                                                   ch_td.a['href'])

        raw_msg = msgpackers._buildReleaseMessage(
            raw_item=raw_item,
            series=title,
            vol=vol,
            chap=chp,
            frag=frag,
            # author      = author,
            postfix=chp_title,
            tl_type=series_type,
            # matchAuthor = True,
            # looseMatch  = True
        )

        msg = msgpackers.createReleasePacket(raw_msg)

        return msg
Example #3
	def dispatchBT(self, itemurl, itemtxt):
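		# Strip a trailing "by <author>" credit (any capitalization) and any
		# "volume"/"chapter" suffix to guess the bare series name.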
		titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]
		probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()

		vol, chp, frag, post = extractTitle(titleonly)

		raw_item = {}
		raw_item['srcname']   = "Baka-Tsuki"
		raw_item['published'] = time.time()
		raw_item['linkUrl']   = itemurl

		self.put_page_link(itemurl)

		msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
		msg = msgpackers.createReleasePacket(msg)
		return msg  # presumably intended; the parallel dispatchNanoDesu returns its packet
Example #4
	def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
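		# NANO_DESU_MAP appears to map site hostnames (netloc) to series titles.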
		itemtitle = NANO_DESU_MAP[netloc]
		vol, chp, frag, post = extractTitle(itemtxt)
		if not (vol or chp):
			return None

		raw_item = {}
		raw_item['srcname']   = "Nano Desu"
		raw_item['published'] = time.time()
		raw_item['linkUrl']   = itemurl

		self.put_page_link(itemurl)

		msg = msgpackers.buildReleaseMessage(raw_item, itemtitle, vol, chp, frag, postfix=post)
		msg = msgpackers.createReleasePacket(msg)
		return msg
Example #5
 def sendReleases(self, releases):
     self.log.info(
         "Total releases found on page: %s. Emitting messages into AMQP local queue.",
         len(releases))
     for release in releases:
         pkt = msgpackers.createReleasePacket(release, beta=IS_BETA)
         self.amqp_put_item(pkt)
Example #6
    def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
        itemtitle = NANO_DESU_MAP[netloc]
        vol, chp, frag, post = extractTitle(itemtxt)
        if not (vol or chp):
            return None

        raw_item = {}
        raw_item['srcname'] = "Nano Desu"
        raw_item['published'] = time.time()
        raw_item['linkUrl'] = itemurl

        self.put_page_link(itemurl)

        msg = msgpackers.buildReleaseMessage(raw_item,
                                             itemtitle,
                                             vol,
                                             chp,
                                             frag,
                                             postfix=post)
        msg = msgpackers.createReleasePacket(msg)
        return msg
Example #7
    def dispatchBT(self, itemurl, itemtxt):
        titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split(
            "BY")[0]
        probSeries = titleonly.lower().split("volume")[0].split(
            "chapter")[0].strip()

        vol, chp, frag, post = extractTitle(titleonly)

        raw_item = {}
        raw_item['srcname'] = "Baka-Tsuki"
        raw_item['published'] = time.time()
        raw_item['linkUrl'] = itemurl

        self.put_page_link(itemurl)

        msg = msgpackers.buildReleaseMessage(raw_item,
                                             probSeries,
                                             vol,
                                             chp,
                                             frag,
                                             postfix=post)
        msg = msgpackers.createReleasePacket(msg)
        return msg  # presumably intended; the parallel dispatchNanoDesu returns its packet
Example #8
    def extractSeries(self, seriesPageUrl, soup):

        itemsoup = self.getSoupForSeriesItem(seriesPageUrl, soup)
        itemdata = self.extractSeriesInfo(itemsoup)
        # print(itemdata)

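        # Normalize genres into lowercase, hyphen-separated tag slugs.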
        tags = []
        if 'genre' in itemdata and itemdata['genre']:
            tags = list(
                set([
                    item.lower().strip().replace("  ", " ").replace(" ", "-")
                    for item in itemdata['genre']
                ]))

        seriesmeta = {}

        seriesmeta['title'] = itemdata['title']
        seriesmeta['alt_titles'] = [
            itemdata['jTitle'],
        ] + itemdata['alt_names']

        seriesmeta['author'] = itemdata['author']
        seriesmeta['illust'] = itemdata['illust']
        seriesmeta['desc'] = itemdata['description']
        if itemdata['pubdate']:
            seriesmeta['pubdate'] = calendar.timegm(
                itemdata['pubdate'].timetuple())
        else:
            seriesmeta['pubdate'] = None
        seriesmeta['pubnames'] = itemdata['pubnames']

        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = None

        seriesmeta['tl_type'] = 'translated'
        seriesmeta['sourcesite'] = 'LNDB'

        # pprint.pprint(itemdata)
        # pprint.pprint(seriesmeta)

        # print(seriesmeta)
        pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                beta=IS_BETA,
                                                matchAuthor=True)
        self.amqp_put_item(pkt)
Example #9
	def extractSeries(self, seriesPageUrl, soup):

		itemsoup = self.getSoupForSeriesItem(seriesPageUrl, soup)
		itemdata = self.extractSeriesInfo(itemsoup)
		# print(itemdata)

		tags = []
		if 'genre' in itemdata and itemdata['genre']:
			tags = list(set([item.lower().strip().replace("  ", " ").replace(" ", "-") for item in itemdata['genre']]))

		seriesmeta = {}


		seriesmeta['title']       = itemdata['title']
		seriesmeta['alt_titles']  = [itemdata['jTitle'], ] + itemdata['alt_names']


		seriesmeta['author']      = itemdata['author']
		seriesmeta['illust']      = itemdata['illust']
		seriesmeta['desc']        = itemdata['description']
		if itemdata['pubdate']:
			seriesmeta['pubdate']     = calendar.timegm(itemdata['pubdate'].timetuple())
		else:
			seriesmeta['pubdate']     = None
		seriesmeta['pubnames']    = itemdata['pubnames']


		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = None

		seriesmeta['tl_type']     = 'translated'
		seriesmeta['sourcesite']  = 'LNDB'


		# pprint.pprint(itemdata)
		# pprint.pprint(seriesmeta)

		# print(seriesmeta)
		pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)
		self.amqp_put_item(pkt)
Example #10
    def extractSeries(self, seriesPageUrl, soup):

        itemsoup = self.getSoupForSeriesItem(seriesPageUrl, soup)
        itemdata = self.extractSeriesInfo(itemsoup)
        # print(itemdata)

        tags = []
        if "genre" in itemdata and itemdata["genre"]:
            tags = list(set([item.lower().strip().replace("  ", " ").replace(" ", "-") for item in itemdata["genre"]]))

        seriesmeta = {}

        seriesmeta["title"] = itemdata["title"]
        seriesmeta["alt_titles"] = [itemdata["jTitle"]] + itemdata["alt_names"]

        seriesmeta["author"] = itemdata["author"]
        seriesmeta["illust"] = itemdata["illust"]
        seriesmeta["desc"] = itemdata["description"]
        if itemdata["pubdate"]:
            seriesmeta["pubdate"] = calendar.timegm(itemdata["pubdate"].timetuple())
        else:
            seriesmeta["pubdate"] = None
        seriesmeta["pubnames"] = itemdata["pubnames"]

        seriesmeta["tags"] = tags
        seriesmeta["homepage"] = None

        seriesmeta["tl_type"] = "translated"
        seriesmeta["sourcesite"] = "LNDB"

        # pprint.pprint(itemdata)
        # pprint.pprint(seriesmeta)

        # print(seriesmeta)
        pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)
        self.amqp_put_item(pkt)
Example #11
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h4", class_='seriestitle')
		if not titletg:
			titletg  = soup.find("div", class_='seriestitlenu')
		altnametg  = soup.find("div", id='editassociated')
		descrtg  = soup.find("div", id='editdescription')

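		# These containers hold their values in child <a> tags; the text_sets
		# containers below hold bare text nodes instead.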
		link_sets = {
			'authortg'        : soup.find("div", id='showauthors'),
			'artisttg'        : soup.find("div", id='showartists'),
			'langtg'          : soup.find("div", id='showlang'),
			'genretg'         : soup.find("div", id='seriesgenre'),
			'tagstg'          : soup.find("div", id='showtags'),
			'typetg'          : soup.find("div", id='showtype'),
			'orig_pub_tg'     : soup.find("div", id='showopublisher'),
			'eng_pub_tg'      : soup.find("div", id='showepublisher'),
		}

		text_sets = {
			'transcompletetg' : soup.find("div", id='showtranslated'),
			'yeartg'          : soup.find("div", id='edityear'),
			'coostatustg'     : soup.find("div", id='editstatus'),
			'licensedtg'      : soup.find("div", id='showlicensed'),
			}

		if not titletg:
			self.log.warn("Could not find item title!")
			print(soup)
			return []
		if not altnametg:
			self.log.warn("Could not find alt-name container tag!")
			return []
		if not descrtg:
			self.log.warn("Could not find description container tag!")
			return []

		data_sets = {}
		for key in list(link_sets.keys()):
			if not link_sets[key]:
				self.log.warn("Could not find tag for name: '%s'", key)
				return []
			data_sets[key] = [tag.get_text() for tag in link_sets[key].find_all("a")]

		for key in list(text_sets.keys()):
			if not text_sets[key]:
				self.log.warn("Could not find tag for name: '%s'", key)
				return []
			data_sets[key] = [tmp.strip() for tmp in text_sets[key].contents if isinstance(tmp, bs4.NavigableString)]

		title  = titletg.get_text().strip()

		data_sets['title'] = title
		data_sets['altnames'] = [tmp.strip() for tmp in altnametg.contents if isinstance(tmp, bs4.NavigableString)]

		# Scrub incoming markup
		for key in list(data_sets.keys()):
			if isinstance(data_sets[key], list):
				data_sets[key] = [bleach.clean(val, tags=[], attributes=[], styles=[], strip=True, strip_comments=True).strip() for val in data_sets[key]]
			else:
				data_sets[key] = bleach.clean(data_sets[key], tags=[], attributes=[], styles=[], strip=True, strip_comments=True).strip()


		if data_sets['yeartg'] and data_sets['yeartg'][0]:
			# print("Non-null data_sets['yeartg']:", data_sets['yeartg'])
			tmp_d = datetime.datetime(year=int(data_sets['yeartg'].pop()), month=1, day=1)
			data_sets['yeartg'] = calendar.timegm(tmp_d.timetuple())
		else:
			data_sets['yeartg'] = None

		# {
		# 	'coostatustg': ['3 Volumes (Ongoing)', '5 Web Volumes (Ongoing)'],
		# 	'orig_pub_tg': ['Media Factory'],
		# 	'eng_pub_tg': [],
		# 	'typetg': ['Web Novel'],
		# 	'genretg': ['Action', 'Adventure', 'Comedy', 'Ecchi', 'Fantasy', 'Romance', 'Seinen'],
		# 	'licensedtg': ['No'],
		# 	'altnames': ['Sendai Yuusha wa Inkyoshitai', 'The Previous Hero wants to Retire', '先代勇者は隠居したい'],
		# 	'authortg': ['Iida K'],
		# 	'artisttg': ['Shimotsuki Eito'],
		# 	'title': 'Sendai Yuusha wa Inkyou Shitai',
		# 	'description': '<p>\n  Three years ago, in the land of Reinbulk, a Legendary Hero was summoned in the Kindom of Leezalion and he succeeded in repelling the Demon King. Now, five students are summoned back into Reinbulk by the Kingdom of Luxeria to fight against the Demon King and the demon army. Unlike the other heroes, Yashiro Yuu has no magical affinity and the Luxeria Kingdom has no intention on acknowledging his existence or returning him to his world.\n </p>\n <p>\n  However, Yuu is actually the previous Hero that had fought the Demon King. Moreover, he is perplexed at the situation since he knows the Demon King has not returned since he sealed him. If the seal was ever broken then he would be automatically summoned instead of normal summoned. Since he already saved the world once and the Demon King hasn’t been unsealed, Yuu decides to leave the demons to the new heroes and retire from the Hero business. So he decides to become an adventurer.\n </p>',
		# 	'tagstg': ['Elves', 'Heroes', 'Magic', 'Monsters', 'Multiple Narrators', 'Protagonist Strong from the Start', 'Strong Male Lead', 'Sword and Sorcery', 'Transported to Another World'],
		# 	'langtg': ['Japanese'],
		# 	'yeartg': ['2013']

		# 	'transcompletetg': ['No'],
		# }

		data_sets['description'] = bleach.clean(descrtg.prettify(), tags=['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol', 'strong', 'ul', 'p'], strip=True).strip()

		series_message = {
			'update_only'   : False,
			'sourcesite'    : "NovelUpdates",
			'title'         : data_sets['title'],
			'alt_titles'    : data_sets['altnames'] + [data_sets['title'], ],

			'desc'          : data_sets['description'],
			# 'homepage'      : data_sets[''],
			'author'        : data_sets['authortg'],
			'illust'        : data_sets['artisttg'],

			'pubdate'       : data_sets['yeartg'],
			'pubnames'      : data_sets['orig_pub_tg'] + data_sets['eng_pub_tg'],
			# 'sourcesite'    : data_sets[''],
			'tags'          : data_sets['tagstg'],

		# AFAICT, NovelUpdates doesn't have any English items, but wth.
			'tl_type'       : "translated" if 'English' not in data_sets['langtg'] else "oel",

			# New:
			'coostate'      : data_sets['coostatustg'],
			'type'          : data_sets['typetg'],
			'genres'        : data_sets['genretg'],
			'licensed'      : data_sets['licensedtg'],
			'transcomplete' : data_sets['transcompletetg'],

			'create_tags'   : True,
		}
		# pprint.pprint(series_message)
		series_info_packet = msgpackers.createSeriesInfoPacket(series_message, matchAuthor=True, beta=self.is_beta)
		# print(series_info_packet)

		extra = {}
		extra['tags']     = data_sets['tagstg']
		# extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'Unknown'


		chapter_tbl = soup.find("table", id='myTable')
		if not chapter_tbl:
			self.log.error("No chapter table!")
			return

		releases = chapter_tbl.find_all("tr")

		valid_releases = 0
		for release in releases:

			items = release.find_all("td")
			if len(items) != 3:
				continue

			date_tg, group_tg, chp_tg = items

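			# Dates are mm/dd/yy with no time component; a release dated today is
			# treated as "right now", presumably so it sorts ahead of older items.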
			rel = datetime.datetime.strptime(date_tg.get_text().strip(), '%m/%d/%y')
			if rel.date() == datetime.date.today():
				reldate = datetime.datetime.now()
			else:
				reldate = datetime.datetime.fromtimestamp(calendar.timegm(rel.timetuple()))

			release_info  = chp_tg.get_text().strip()
			group_name = group_tg.get_text().strip()
			group_name = msgpackers.fixSmartQuotes(group_name)


			upsertNuItem(self.raw_cur,
				{
					'seriesname'       : title,
					'releaseinfo'      : release_info,
					'groupinfo'        : group_name,
					'referrer'         : seriesPageUrl,
					'outbound_wrapper' : chp_tg.a['href'],
					'first_seen'       : reldate,
				})


			valid_releases += 1


		self.log.info("Committing!")
		self.raw_cur.execute("COMMIT;")
		self.log.info("Committed!")
		# Do not add series without 3 chapters.
		if valid_releases < 3:
			self.log.warning("Less then three chapters!")
			return

		self.amqp_put_item(series_info_packet)
		return
Example #12
 def sendReleases(self, releases):
     self.log.info("Total releases found on page: %s", len(releases))
     for release in releases:
         pkt = msgpackers.createReleasePacket(release, beta=IS_BETA)
         self.amqp_put_item(pkt)
Example #13
    def extractSeriesReleases(self, seriesPageUrl, metadata, soup):

        title = metadata['title']
        author = metadata['user']['name']
        desc = metadata['description']
        tags = metadata['tags']

        # Apparently the description is rendered in a <pre> tag.
        # Huh?
        desc = markdown.markdown(desc, extensions=["mdx_linkify"])

        title = title.strip()

        # Siiiiiigh. Really?
        title = title.replace("[#wattys2015]", "")
        title = title.replace("(Wattys2015) ", "")
        title = title.replace("#Wattys2015", "")
        title = title.replace("Wattys2015", "")
        title = title.strip()

        if metadata['numParts'] < 3:
            return []
        if metadata['voteCount'] < 100:
            return []

        # Language ID 1 is english.
        if metadata['language']['id'] != 1:
            return []

        # Allow blocking of item by ID
        if metadata['id'] in BLOCK_IDS:
            return []

        # for some particularly stupid reasons, the item category tag is
        # not included in the metadata.
        # therefore, we parse it out from the page manually.
        tagdiv = soup.find("div", class_="tags")
        if tagdiv:
            for tag in tagdiv.find_all("a", class_='tag'):
                tags.append(tag.get_text())

        tags = list(
            set([
                item.lower().strip().replace("  ", " ").replace(" ", "-")
                for item in tags
            ]))

        # Mask any content with any of the blocked tags.
        if any([item in tags for item in WATTPAD_MASKED_TAGS]):
            self.log.warning(
                "Item has a masked tag. Not emitting any releases.")
            self.log.warning("Tags: '%s'", tags)
            return

        # And check that at least one of the target tags is present.
        if not any([item in tags for item in WATTPAD_REQUIRED_TAGS]):
            self.log.warning(
                "Item missing required tag. Not emitting any releases.")
            self.log.warning("Tags: '%s'", tags)
            return

        seriesmeta = {}

        extra = {}
        extra['tags'] = tags[:]
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'WattPad'

        retval = []
        index = 1
        valid = 1
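        # Wattpad parts carry no usable chapter numbering, so each part is
        # numbered by its position in the list.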
        for release in metadata['parts']:
            chp_title = release['title']

            dt = datetime.datetime.strptime(release['modifyDate'],
                                            "%Y-%m-%dT%H:%M:%SZ")
            reldate = calendar.timegm(dt.timetuple())

            raw_item = {}
            raw_item['srcname'] = "WattPad"
            raw_item['published'] = reldate
            raw_item['linkUrl'] = release['url']
            msg = msgpackers._buildReleaseMessage(raw_item,
                                                  title,
                                                  None,
                                                  index,
                                                  None,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
            retval.append(msg)

            # Check if there was substantive structure in the chapter
            # name. Used as a crude heuristic for chapter validity.
            # vol, chp, frag, post = extractTitle(chp_title)
            # if any((vol, chp, frag)):
            # 	# print("Valid: ", (vol, chp, frag))
            # 	valid += 1

            index += 1

        # if valid < (index/2):
        # 	print("Half the present chapters are have no numeric content?")
        # 	return []

        # Don't send the series metadata if we didn't find any chapters.
        if not retval:
            print("No chapters!")
            return []

        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = desc
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'WattPad'

        pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                beta=IS_BETA,
                                                matchAuthor=True)
        self.log.info("Wattpad scraper generated %s amqp messages!",
                      len(retval) + 1)
        self.amqp_put_item(pkt)
        return retval
Example #14
    def extractSeriesReleases(self, seriesPageUrl, soup):

        # Yeah, the title text is in a div with an id of "titlePic".
        # The actual image is in a div with the /class/ titlePic
        # wat.
        titlecontainer = soup.find("div", id='titlePic')
        if not titlecontainer:
            titlecontainer = soup.find("div", id='title')
        if not titlecontainer:
            raise ValueError("No title at URL: '%s'" % seriesPageUrl)

        titletg = titlecontainer.h1
        typetg, authortg, categorytg = titlecontainer.find_all("a")

        if "novel" not in typetg.get_text().lower():
            return []

        if not titletg:
            return []
        if not authortg:
            return []

        title = titletg.get_text()
        author = authortg.get_text()
        genre = categorytg.get_text()

        descDiv = soup.find('p', class_='summary')
        for item in descDiv.find_all("a"):
            item.decompose()
        desc = [
            item.strip() for item in descDiv.find_all(text=True)
            if item.strip()
        ]

        tagdiv = soup.find("div", id='cloudMain')

        tags = []
        # Skip if no tags
        if tagdiv:
            tags = [
                item.get_text().strip().lower()
                for item in tagdiv.find_all("a")
            ]

        tags.append(genre.lower())
        # Fix a lot of the stupid tag fuckups I've seen.
        # People are stupid.
        if 'science' in tags and 'fiction' in tags:
            tags.append("science-fiction")
        tags = [tag for tag in tags if tag not in BAD_TAGS]
        tags = [tag for tag in tags if len(tag) > 2]
        tags = [tag.replace("  ", " ").replace(" ", "-") for tag in tags]
        tags = list(set(tags))

        if not any([tag in BOOKSIE_REQUIRED_TAGS for tag in tags]):
            self.log.info("Missing required tags!")
            return []
        if any([tag in BOOKSIE_MASKED_TAGS for tag in tags]):
            self.log.info("Masked tag!")
            return []

        # Wrap the paragraphs in p tags.
        desc = ['<p>{text}</p>'.format(text=para) for para in desc]

        seriesmeta = {}
        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\n\n ".join([str(para) for para in desc])
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'Booksie'

        pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                beta=IS_BETA,
                                                matchAuthor=True)

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'Booksie'

        # Decompose the announcement (?) div that's cluttering up the
        # search for the chapterdiv
        badchp = soup.find("div", class_='chapters', id='noticeMessage')
        badchp.decompose()

        chapters = soup.find("div", class_='chapters')
        releases = chapters.find_all('a')

        retval = []
        for release in releases:

            # No post time, unfortunately
            chp = int(release.get_text())
            reldate = time.time()

            # Force releases to the beginning of time until we catch up.
            reldate = 0

            vol = None
            frag = None

            raw_item = {}
            raw_item['srcname'] = "Booksie"
            raw_item['published'] = reldate
            raw_item['linkUrl'] = release['href']

            msg = msgpackers.buildReleaseMessage(raw_item,
                                                 title,
                                                 vol,
                                                 chp,
                                                 frag,
                                                 author=author,
                                                 tl_type='oel',
                                                 extraData=extra,
                                                 matchAuthor=True)
            retval.append(msg)

        if not retval:
            print("No releases?")
            return []
        self.amqp_put_item(pkt)
        return retval
Example #15
	def extractSeriesReleases(self, seriesPageUrl, soup):

		match = self.match_re.search(seriesPageUrl)
		series_id = match.group(1)
		conf = load_lut()

		assert 'force_sequential_numbering' in conf

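		# Series with known-broken chapter numbering can be force-renumbered
		# via the 'force_sequential_numbering' list in the lookup table.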
		must_renumber = series_id in conf['force_sequential_numbering']


		# print("")
		# print("Match: ", match, match.groups(), series_id)
		# print("series_id", series_id)
		# print("Renumber:", must_renumber)


		header   = soup.find("div", class_='fic-title')
		titletg  = header.find("h2")
		authortg = header.find("h4")
		authortg.find("span").decompose()

		ratingtg_type_1 = soup.find("div", class_='rating')
		ratingtg_type_2 = soup.find("li", text=re.compile('Overall Score'))


		if ratingtg_type_1:
			startg = ratingtg_type_1.find("span", class_='star')
		elif ratingtg_type_2:
			# print(ratingtg_type_2)
			starcontainer = ratingtg_type_2.find_next_sibling("li")
			if not starcontainer:
				self.log.error("Could not find rating tag (starcontainer)!")
				return []
			startg = starcontainer.find("span", class_='star')
			if not startg:
				self.log.error("Could not find rating tag (startg)!")
				return []


		else:
			self.log.error("Could not find rating tag!")
			return []

		ratingcls = [tmp for tmp in startg['class'] if re.match(r"star\-\d+", tmp)]
		# print(startg['class'])
		if not ratingcls:
			return []

		rating = ratingcls[0].split("-")[-1]
		rating = float(rating) / 10
		rating = rating * 2  # Normalize to a 0-10 scale

		if not rating >= MIN_RATING and rating != 0.0:
			self.log.error("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.error("Could not find title tag!")
			return []
		if not authortg:
			self.log.error("Could not find author tag!")
			return []


		title  = titletg.get_text().strip()
		author = authortg.get_text().strip()



		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		if not descDiv or not descDiv.div:
			self.log.error("Incomplete or broken description?")
			return []

		desc = []
		for segment in descDiv.div:
			if isinstance(segment, bs4.NavigableString):
				desc.append(str(segment).strip())
			else:
				if segment.get_text().strip():
					desc.append(segment.get_text().strip())

		desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
		# print(desc)

		tags = []
		tagdiv = soup.find('div', class_='tags')
		for tag in tagdiv.find_all('span', class_='label'):
			tagtxt = tag.get_text().strip().lower().replace(" ", "-")
			# print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
			if tagtxt in conf['tag_rename']:
				tagtxt = conf['tag_rename'][tagtxt]
			tags.append(tagtxt)

		info_div = soup.find("div", class_='fiction-info')
		warning_div = info_div.find("div", class_='font-red-sunglo')
		if warning_div:
			for warning_tag in warning_div.find_all('li'):
				tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
				# print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
				if tagtxt in conf['tag_rename']:
					tagtxt = conf['tag_rename'][tagtxt]
				tags.append(tagtxt)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = "\r\n".join(desc)
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'
		seriesmeta['create_tags'] = True


		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)
		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find_all("tr", attrs={"data-url" : True})

		raw_retval = []
		for chapter in chapters:
			if len(chapter.find_all("td")) != 2:
				self.log.warning("Row with invalid number of entries?")
				continue
			cname, cdate = chapter.find_all("td")

			reldate = cdate.time['unixtime']
			relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)


			chp_title = cname.get_text().strip()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title + " " + title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = float(reldate)
			raw_item['linkUrl']   = relurl

			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)

			raw_retval.append(raw_msg)

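		# Count releases whose titles yielded no volume or chapter number; if
		# most of them are unnumbered (or renumbering is forced), fall back to
		# simple sequential numbering below.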
		missing_chap = 0
		for item in raw_retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(raw_retval):
			unnumbered = (missing_chap/len(raw_retval)) * 100
			if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
				if must_renumber:
					self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
				else:
					self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in raw_retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series without 3 chapters.
		if len(raw_retval) < 3:
			self.log.info("Less then three chapters!")
			return []

		if not raw_retval:
			self.log.info("Retval empty?!")
			return []

		self.amqp_put_item(meta_pkt)
		retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
		return retval
Example #16
    def extractSeriesReleases(self, seriesPageUrl, soup):

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)
        conf = load_lut()

        assert 'force_sequential_numbering' in conf

        must_renumber = series_id in conf['force_sequential_numbering']

        # print("")
        # print("Match: ", match, match.groups(), series_id)
        # print("series_id", series_id)
        # print("Renumber:", must_renumber)

        header = soup.find("div", class_='fic-title')
        titletg = header.find("h1")
        authortg = header.find("h4")
        authortg.find("span").decompose()

        rating_val = soup.find("meta", property='books:rating:value')
        rating_scale = soup.find("meta", property='books:rating:scale')

        print("Rating value:", rating_val)
        print("Rating scale:", rating_scale)

        if not rating_val or not rating_scale:
            return []

        rval_f = float(rating_val.get('content', "0"))
        rscale_f = float(rating_scale.get('content', "999999"))

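        # Normalize the <meta> rating to a 0-5 scale using the declared scale.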
        rating = 5 * (rval_f / rscale_f)

        print("Float rating: ", rating)

        if not rating >= MIN_RATING and rating != 0.0:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if not titletg:
            self.log.error("Could not find title tag!")
            return []
        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='description')
        if not descDiv or not descDiv.div:
            self.log.error("Incomplete or broken description?")
            return []

        desc = []
        for segment in descDiv.div:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
        # print(desc)

        tags = []
        tagdiv = soup.find('span', class_='tags')
        for tag in tagdiv.find_all('span', class_='label'):
            tagtxt = tag.get_text().strip().lower().replace(" ", "-")
            # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
            if tagtxt in conf['tag_rename']:
                tagtxt = conf['tag_rename'][tagtxt]
            tags.append(tagtxt)

        info_div = soup.find("div", class_='fiction-info')
        warning_div = info_div.find("div", class_='font-red-sunglo')
        if warning_div:
            for warning_tag in warning_div.find_all('li'):
                tagtxt = warning_tag.get_text().strip().lower().replace(
                    " ", "-")
                # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
                if tagtxt in conf['tag_rename']:
                    tagtxt = conf['tag_rename'][tagtxt]
                tags.append(tagtxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        chapters = soup.find_all("tr", attrs={"data-url": True})

        raw_retval = []
        for chapter in chapters:
            if len(chapter.find_all("td")) != 2:
                self.log.warning("Row with invalid number of entries?")
                continue
            cname, cdate = chapter.find_all("td")

            reldate = cdate.time['unixtime']
            relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, post = extractTitle(chp_title + " " + title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                     title,
                                                     vol,
                                                     chp,
                                                     frag,
                                                     author=author,
                                                     postfix=chp_title,
                                                     tl_type='oel',
                                                     extraData=extra,
                                                     matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        missing_chap = 0
        for item in raw_retval:
            if not (item['vol'] or item['chp']):
                missing_chap += 1

        if len(raw_retval):
            unnumbered = (missing_chap / len(raw_retval)) * 100
            if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
                if must_renumber:
                    self.log.warning(
                        "Item numbering force-overridden! Adding simple sequential chapter numbers."
                    )
                else:
                    self.log.warning(
                        "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                    )
                chap = 1
                for item in raw_retval:
                    item['vol'] = None
                    item['chp'] = chap
                    chap += 1

        # Do not add series without 3 chapters.
        if len(raw_retval) < 3:
            self.log.info("Less then three chapters!")
            return []

        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        # self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        return retval
Example #17
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			return []

		if not float(ratingtg['score']) >= MIN_RATING:
			return []

		if not titletg:
			return []
		if not authortg:
			return []
		if not ratingtg:
			return []

		title  = titletg.get_text()
		author = authortg.get_text()
		assert author.startswith("by ")
		author = author[2:].strip()


		descDiv = soup.find('div', class_='description')
		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				[tags.append(item.strip()) for item in items if item.strip()]
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		pkt = msgpackers.sendSeriesInfoPacket(seriesmeta)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find("div", class_='chapters')
		releases = chapters.find_all('li', class_='chapter')

		retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra)
			retval.append(msg)

		# Do not add series without 3 chapters.
		if len(retval) < 3:
			return []

		if not retval:
			return []
		self.amqp_put_item(pkt)
		return retval
Example #18
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			self.log.info("Could not find rating tag!")
			return []


		rating = float(ratingtg['score'])
		if not rating >= MIN_RATING and rating != 0.0:
			self.log.info("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.info("Could not find title tag!")
			return []
		if not authortg:
			self.log.info("Could not find author tag!")
			return []


		title  = titletg.get_text()
		author = authortg.get_text()
		assert author.startswith("by ")
		author = author[2:].strip()


		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				[tags.append(item.strip()) for item in items if item.strip()]
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find("div", class_='chapters')
		releases = chapters.find_all('li', class_='chapter')

		raw_retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title + " " + title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)

			raw_retval.append(raw_msg)

		missing_chap = 0
		for item in raw_retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(raw_retval):
			unnumbered = (missing_chap/len(raw_retval)) * 100
			if len(raw_retval) >= 5 and unnumbered > 80:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in raw_retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series without 3 chapters.
		if len(raw_retval) < 3:
			self.log.info("Less then three chapters!")
			return []

		if not raw_retval:
			self.log.info("Retval empty?!")
			return []

		self.amqp_put_item(meta_pkt)
		retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
		return retval
Example #19
    def extractSeriesReleases(self, seriesPageUrl, soup):

        titletg = soup.find("h4", class_='seriestitle')
        if not titletg:
            titletg = soup.find("div", class_='seriestitlenu')
        altnametg = soup.find("div", id='editassociated')
        descrtg = soup.find("div", id='editdescription')

        link_sets = {
            'authortg': soup.find("div", id='showauthors'),
            'artisttg': soup.find("div", id='showartists'),
            'langtg': soup.find("div", id='showlang'),
            'genretg': soup.find("div", id='seriesgenre'),
            'tagstg': soup.find("div", id='showtags'),
            'typetg': soup.find("div", id='showtype'),
            'orig_pub_tg': soup.find("div", id='showopublisher'),
            'eng_pub_tg': soup.find("div", id='showepublisher'),
        }

        text_sets = {
            'transcompletetg': soup.find("div", id='showtranslated'),
            'yeartg': soup.find("div", id='edityear'),
            'coostatustg': soup.find("div", id='editstatus'),
            'licensedtg': soup.find("div", id='showlicensed'),
        }

        if not titletg:
            self.log.warn("Could not find item title!")
            self.log.warn("On URL: '%s'", seriesPageUrl)
            self.log.warn("%s", soup)
            return []

        if not altnametg:
            self.log.warn("Could not find alt-name container tag!")
            return []
        if not descrtg:
            self.log.warn("Could not find description container tag!")
            return []

        data_sets = {}
        for key in list(link_sets.keys()):
            if not link_sets[key]:
                self.log.warn("Could not find tag for name: '%s'", key)
                return []
            data_sets[key] = [
                tag.get_text() for tag in link_sets[key].find_all("a")
            ]

        for key in list(text_sets.keys()):
            if not text_sets[key]:
                self.log.warn("Could not find tag for name: '%s'", key)
                return []
            data_sets[key] = [
                tmp.strip() for tmp in text_sets[key].contents
                if isinstance(tmp, bs4.NavigableString)
            ]

        title = titletg.get_text().strip()

        data_sets['title'] = title
        data_sets['altnames'] = [
            tmp.strip() for tmp in altnametg.contents
            if isinstance(tmp, bs4.NavigableString)
        ]

        data_sets['altnames'] = [
            tmp for tmp in data_sets['altnames'] if tmp.lower() != 'n/a'
        ]

        # Scrub incoming markup
        for key in list(data_sets.keys()):
            if isinstance(data_sets[key], list):
                data_sets[key] = [
                    bleach.clean(val,
                                 tags=[],
                                 attributes=[],
                                 styles=[],
                                 strip=True,
                                 strip_comments=True).strip()
                    for val in data_sets[key]
                ]
            else:
                data_sets[key] = bleach.clean(data_sets[key],
                                              tags=[],
                                              attributes=[],
                                              styles=[],
                                              strip=True,
                                              strip_comments=True).strip()

        if data_sets['yeartg'] and data_sets['yeartg'][0]:
            # print("Non-null data_sets['yeartg']:", data_sets['yeartg'])
            try:
                yearstr = data_sets['yeartg'].pop().split("-")[0]
                tmp_d = datetime.datetime(year=int(yearstr), month=1, day=1)
                data_sets['yeartg'] = calendar.timegm(tmp_d.timetuple())
            except ValueError:
                data_sets['yeartg'] = None
        else:
            data_sets['yeartg'] = None

        # {
        # 	'coostatustg': ['3 Volumes (Ongoing)', '5 Web Volumes (Ongoing)'],
        # 	'orig_pub_tg': ['Media Factory'],
        # 	'eng_pub_tg': [],
        # 	'typetg': ['Web Novel'],
        # 	'genretg': ['Action', 'Adventure', 'Comedy', 'Ecchi', 'Fantasy', 'Romance', 'Seinen'],
        # 	'licensedtg': ['No'],
        # 	'altnames': ['Sendai Yuusha wa Inkyoshitai', 'The Previous Hero wants to Retire', '先代勇者は隠居したい'],
        # 	'authortg': ['Iida K'],
        # 	'artisttg': ['Shimotsuki Eito'],
        # 	'title': 'Sendai Yuusha wa Inkyou Shitai',
        # 	'description': '<p>\n  Three years ago, in the land of Reinbulk, a Legendary Hero was summoned in the Kindom of Leezalion and he succeeded in repelling the Demon King. Now, five students are summoned back into Reinbulk by the Kingdom of Luxeria to fight against the Demon King and the demon army. Unlike the other heroes, Yashiro Yuu has no magical affinity and the Luxeria Kingdom has no intention on acknowledging his existence or returning him to his world.\n </p>\n <p>\n  However, Yuu is actually the previous Hero that had fought the Demon King. Moreover, he is perplexed at the situation since he knows the Demon King has not returned since he sealed him. If the seal was ever broken then he would be automatically summoned instead of normal summoned. Since he already saved the world once and the Demon King hasn’t been unsealed, Yuu decides to leave the demons to the new heroes and retire from the Hero business. So he decides to become an adventurer.\n </p>',
        # 	'tagstg': ['Elves', 'Heroes', 'Magic', 'Monsters', 'Multiple Narrators', 'Protagonist Strong from the Start', 'Strong Male Lead', 'Sword and Sorcery', 'Transported to Another World'],
        # 	'langtg': ['Japanese'],
        # 	'yeartg': ['2013']

        # 	'transcompletetg': ['No'],
        # }

        data_sets['description'] = bleach.clean(descrtg.prettify(),
                                                tags=[
                                                    'a', 'abbr', 'acronym',
                                                    'b', 'blockquote', 'code',
                                                    'em', 'i', 'li', 'ol',
                                                    'strong', 'ul', 'p'
                                                ],
                                                strip=True).strip()

        series_message = {
            'update_only': False,
            'sourcesite': "NovelUpdates",
            'title': data_sets['title'],
            'alt_titles': data_sets['altnames'] + [
                data_sets['title'],
            ],
            'desc': data_sets['description'],
            # 'homepage'      : data_sets[''],
            'author': data_sets['authortg'],
            'illust': data_sets['artisttg'],
            'pubdate': data_sets['yeartg'],
            'pubnames': data_sets['orig_pub_tg'] + data_sets['eng_pub_tg'],
            # 'sourcesite'    : data_sets[''],
            'tags': data_sets['tagstg'],

            # AFAICT, NovelUpdates doesn't have any English items, but wth.
            'tl_type':
            "translated" if 'English' not in data_sets['langtg'] else "oel",

            # New:
            'coostate': "<br />".join(data_sets['coostatustg']),
            'type': data_sets['typetg'],
            'genres': data_sets['genretg'],
            'licensed': "<br />".join(data_sets['licensedtg']),
            'transcomplete': "<br />".join(data_sets['transcompletetg']),
            'create_tags': True,
        }
        # pprint.pprint(series_message)
        series_info_packet = msgpackers.createSeriesInfoPacket(
            series_message, matchAuthor=True, beta=self.is_beta)
        # print(series_info_packet)

        extra = {}
        extra['tags'] = data_sets['tagstg']
        # extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'Unknown'

        chapter_tbl = soup.find("table", id='myTable')
        if not chapter_tbl:
            self.log.error("No chapter table!")
            return

        releases = chapter_tbl.find_all("tr")

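        # Some release links are hidden behind generated CSS classes; skip any
        # link whose class appears in the masked set.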
        masked_classes = self.getMaskedClasses(soup)

        valid_releases = 0
        for release in releases:
            items = release.find_all("td")
            if len(items) != 3:
                continue

            date_tg, group_tg, chp_tg = items

            rel = datetime.datetime.strptime(date_tg.get_text().strip(),
                                             '%m/%d/%y')
            if rel.date() == datetime.date.today():
                reldate = datetime.datetime.now()
            else:
                reldate = datetime.datetime.fromtimestamp(
                    calendar.timegm(rel.timetuple()))

            print("Release date: ", reldate)

            release_info = chp_tg.get_text().strip()
            group_name = group_tg.get_text().strip()
            group_name = msgpackers.fixSmartQuotes(group_name)

            linkas = release.find_all('a', class_='chp-release')

            for link in linkas:
                bad = any([tmp in masked_classes for tmp in link['class']])
                if not bad:
                    linkfq = link['href']
                    if linkfq.startswith("//"):
                        linkfq = "https:" + linkfq
                    if "http://" in linkfq:
                        linkfq = linkfq.split("http://")[0]

                    if group_name == 'Qidian International':
                        self.log.info("Qidian item. Skipping.")

                    elif group_name == 'Webnovel':
                        self.log.info("Qidian item. Skipping.")

                    else:

                        changed = upsertNuItem(
                            self.raw_cur, {
                                'seriesname': title,
                                'releaseinfo': release_info,
                                'groupinfo': group_name,
                                'referrer': seriesPageUrl,
                                'outbound_wrapper': linkfq,
                                'release_date': reldate,
                                'first_seen': datetime.datetime.now(),
                            })
                        self.log.info(
                            "Upserting outbound wrapper url %s, changed %s rows.",
                            linkfq, changed)

                        if changed:
                            self.mon_con.incr('new-urls', 1)

            valid_releases += 1

        self.log.info("Found %s releases on page!", valid_releases)
        self.log.info("Committing!")
        self.raw_cur.execute("COMMIT;")
        self.log.info("Committed!")

        # Do not add series without 3 chapters.
        # if valid_releases < 3:
        # 	self.log.warning("Less then three chapters!")
        # 	return

        self.amqp_put_item(series_info_packet)
        return
Example #20
	def extractSeriesReleases(self, seriesPageUrl, soup):
		title  = soup.find("div", class_='fanfic_title_div').get_text()
		author = soup.find("div", class_='fanfic_author_div').get_text()
		ratingtg = soup.find("div", class_='fanfic_title_wrapper')
		ratingtg = [item for item in ratingtg.contents if "Rating" in str(item)]
		if not ratingtg:
			ratingtg = ''
		else:
			ratingtg = ratingtg.pop()


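		# The header line appears to read "Rating: X · N views · M chapters".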
		rating, views, chapters = ratingtg.split("·")

		# I think the japtem rating system is just plain out broken.
		if not "no rating" in ratingtg.lower():
			rating_score = float(rating.split()[-1])
			if not rating_score >= MIN_RATING:
				return []


		chapter_num = float(chapters.split()[0])
		if chapter_num < 3:
			return []



		if not title:
			return []
		if not author:
			return []


		descDiv = soup.find('div', class_='fanfic_synopsis')

		if not descDiv:
			print(soup)
			return []

		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = ''
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'JapTem'


		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = ''
		extra['sourcesite']  = 'JapTem'

		retval = []

		chapters = soup.find("ul", class_='fanfic_chapter_list')
		volumes = chapters.find_all('li', class_='fanfic_volume')
		for volume in volumes:
			releases = volume.find_all('li', class_='fanfic_chapter')
			for release in releases:
				chp_title = release.find("a")

				vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
				reldate = time.time()

				chp_title = chp_title.get_text()

				agg_title = " ".join((vol_str, chp_title))
				# print("Chp title: '{}'".format(chp_title))
				vol, chp, frag, post = extractTitle(agg_title)
				raw_item = {}
				raw_item['srcname']   = "JapTem"
				raw_item['published'] = reldate
				releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
				raw_item['linkUrl']   = releaseurl

				msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra)
				msg = msgpackers.createReleasePacket(msg)

				retval.append(msg)
		if not retval:
			return []

		retval.append(meta_pkt)
		# return []
		return retval
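The extractTitle() calls throughout these scrapers return a (vol, chp, frag, postfix) tuple parsed out of a chapter-title string. The real parser handles many more formats; a deliberately simplified regex sketch of the idea, for illustration only:

# Simplified stand-in for extractTitle(); not the project's parser.
import re

def simple_extract_title(title):
    vol = chp = frag = None
    m = re.search(r"volume\s+(\d+)", title, re.IGNORECASE)
    if m:
        vol = int(m.group(1))
    m = re.search(r"chapter\s+(\d+)(?:\.(\d+))?", title, re.IGNORECASE)
    if m:
        chp = int(m.group(1))
        frag = int(m.group(2)) if m.group(2) else None
    return vol, chp, frag, title

print(simple_extract_title("Volume 2 Chapter 15.5 - The Duel"))
# -> (2, 15, 5, 'Volume 2 Chapter 15.5 - The Duel')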
    def extractSeriesReleases(self, seriesPageUrl, soup):

        # Yeah, the title text is in a div with an id of "titlePic".
        # The actual image is in a div with the /class/ titlePic
        # wat.
        titlecontainer = soup.find("div", id="titlePic")
        if not titlecontainer:
            titlecontainer = soup.find("div", id="title")
        if not titlecontainer:
            raise ValueError("No title at URL: '%s'", seriesPageUrl)

        titletg = titlecontainer.h1
        typetg, authortg, categorytg = titlecontainer.find_all("a")

        if "novel" not in typetg.get_text().lower():
            return []

        if not titletg:
            return []
        if not authortg:
            return []

        title = titletg.get_text()
        author = authortg.get_text()
        genre = categorytg.get_text()

        descDiv = soup.find("p", class_="summary")
        for item in descDiv.find_all("a"):
            item.decompose()
        desc = [item.strip() for item in descDiv.find_all(text=True) if item.strip()]

        tagdiv = soup.find("div", id="cloudMain")

        tags = []
        # Skip if no tags
        if tagdiv:
            tags = [item.get_text().strip().lower() for item in tagdiv.find_all("a")]

        tags.append(genre.lower())
        # Fix a lot of the stupid tag fuckups I've seen.
        # People are stupid.
        if "science" in tags and "fiction" in tags:
            tags.append("science-fiction")
        tags = [tag for tag in tags if tag not in BAD_TAGS]
        tags = [tag for tag in tags if len(tag) > 2]
        tags = [tag.replace("  ", " ").replace(" ", "-") for tag in tags]
        tags = list(set(tags))

        if not any([tag in BOOKSIE_REQUIRED_TAGS for tag in tags]):
            self.log.info("Missing required tags!")
            return []
        if any([tag in BOOKSIE_MASKED_TAGS for tag in tags]):
            self.log.info("Masked tag!")
            return []

        # Wrap the paragraphs in p tags.
        desc = ["<p>{text}</p>".format(text=para) for para in desc]

        seriesmeta = {}
        seriesmeta["title"] = title
        seriesmeta["author"] = author
        seriesmeta["tags"] = tags
        seriesmeta["homepage"] = seriesPageUrl
        seriesmeta["desc"] = "\n\n ".join([str(para) for para in desc])
        seriesmeta["tl_type"] = "oel"
        seriesmeta["sourcesite"] = "Booksie"

        pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)

        extra = {}
        extra["tags"] = tags
        extra["homepage"] = seriesPageUrl
        extra["sourcesite"] = "Booksie"

        # Decompose the announcement (?) div that's cluttering up the
        # search for the chapterdiv
        badchp = soup.find("div", class_="chapters", id="noticeMessage")
        badchp.decompose()

        chapters = soup.find("div", class_="chapters")
        releases = chapters.find_all("a")

        retval = []
        for release in releases:

            # No post time, unfortunately
            chp = int(release.get_text())
            reldate = time.time()

            # Force releases to the beginning of time until we catch up.
            reldate = 0

            vol = None
            frag = None

            raw_item = {}
            raw_item["srcname"] = "Booksie"
            raw_item["published"] = reldate
            raw_item["linkUrl"] = release["href"]

            msg = msgpackers.buildReleaseMessage(
                raw_item, title, vol, chp, frag, author=author, tl_type="oel", extraData=extra, matchAuthor=True
            )
            retval.append(msg)

        if not retval:
            print("No releases?")
            return []
        self.amqp_put_item(pkt)
        return retval
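The Booksie extractor above spends much of its effort cleaning tags. The same pass as a standalone function (BAD_TAGS here is a placeholder for the module-level blacklist):

# Standalone sketch of the tag-cleanup pass; BAD_TAGS is illustrative.
BAD_TAGS = {"other", "misc"}

def clean_tags(tags):
    tags = [tag.strip().lower() for tag in tags]
    tags = [tag for tag in tags if tag not in BAD_TAGS and len(tag) > 2]
    tags = [tag.replace("  ", " ").replace(" ", "-") for tag in tags]
    return sorted(set(tags))  # dedupe; sorted only for stable output

print(clean_tags(["Science Fiction", "Misc", "ADVENTURE", "it"]))
# -> ['adventure', 'science-fiction']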
	def extractSeriesReleases(self, seriesPageUrl, soup):

		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			self.log.info("Could not find rating tag!")
			return []


		rating = float(ratingtg['score'])
		if rating < MIN_RATING and rating != 0.0:
			self.log.info("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.info("Could not find title tag!")
			return []
		if not authortg:
			self.log.info("Could not find author tag!")
			return []


		title  = titletg.get_text()
		author = authortg.get_text()
		assert author.startswith("by ")
		author = author[2:].strip()


		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'


		chapters = soup.find("div", class_='chapters')
		releases = chapters.find_all('li', class_='chapter')

		retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# print("Chp title: '{}'".format(chp_title))
			vol, chp, frag, post = extractTitle(chp_title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
			retval.append(msg)

		missing_chap = 0
		for item in retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(retval):
			unnumbered = (missing_chap/len(retval)) * 100
			if len(retval) >= 5 and unnumbered > 80:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series without 3 chapters.
		if len(retval) < 3:
			self.log.info("Less then three chapters!")
			return []



		if not retval:
			self.log.info("Retval empty?!")
			return []
		self.amqp_put_item(pkt)
		return retval
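The numbering fallback near the end of the method above is worth isolating: when five or more releases exist and over 80% of them parsed without a volume or chapter number, the scraper assigns simple sequential chapter numbers instead. As a standalone helper:

# Standalone sketch of the sequential-numbering fallback above.
def apply_sequential_numbering(items, min_items=5, unnumbered_pct=80):
    if not items:
        return items
    missing = sum(1 for item in items if not (item['vol'] or item['chp']))
    if len(items) >= min_items and (missing / len(items)) * 100 > unnumbered_pct:
        for chap, item in enumerate(items, start=1):
            item['vol'] = None
            item['chp'] = chap
    return items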
	def sendReleases(self, releases):
		self.log.info("Total releases found on page: %s", len(releases))
		for release in releases:
			pkt = msgpackers.createReleasePacket(release, beta=IS_BETA)
			self.amqp_put_item(pkt)
	def sendReleases(self, releases):
		self.log.info("Total releases found on page: %s. Emitting messages into AMQP local queue.", len(releases))
		for release in releases:
			pkt = msgpackers.createReleasePacket(release, beta=self.is_beta)
			self.amqp_put_item(pkt)
Example #25
    def extractSeriesReleases(self, seriesPageUrl, soup):

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)

        titletg = soup.find("div", class_='fic_title')
        authortg = soup.find("span", class_='auth_name_fic')

        if not titletg:
            self.log.error("Could not find title tag!")
            return []

        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        metas = soup.find_all("script", type="application/ld+json")
        agg_meta = {}
        for meta in metas:
            loaded = json.loads(meta.get_text())
            for k, v in loaded.items():
                agg_meta[k] = v

        rating = float(agg_meta.get('ratingValue', "0"))
        rating_cnt = float(agg_meta.get('ratingCount', "0"))

        self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)

        if rating < SeriesPageCommon.MIN_RATING_STARS:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
            self.log.error("Item has insufficent ratings: %s", rating_cnt)
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='wi_fic_desc')
        if not descDiv or not descDiv.p:
            self.log.error("Incomplete or broken description?")
            return []

        desc = []
        for segment in descDiv:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

        tags = []
        tagdiv = soup.find('span', class_='wi_fic_showtags')
        for tag in tagdiv.find_all('a', class_='stag'):
            tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
            tagtxt = SeriesPageCommon.fix_tag(tagtxt)
            tags.append(tagtxt)

        # These are separate on SH, but I'm just treating them as tags.
        for tag in soup.find_all('li', class_='mature_contains'):
            tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
            tagtxt = SeriesPageCommon.fix_tag(tagtxt)
            tags.append(tagtxt)

        genres = []
        genrediv = soup.find('span', class_='wi_fic_genre')
        for genre in genrediv.find_all('a', class_='fic_genre'):
            genretxt = SeriesPageCommon.clean_tag(genre.get_text())
            genretxt = SeriesPageCommon.fix_genre(genretxt)
            genres.append(genretxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'ScribbleHub'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        extra = {}
        extra['tags'] = tags
        extra['genres'] = genres
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'ScribbleHub'

        self.log.info("Found %s tags, %s genres", len(tags), len(genres))

        chapters = soup.find_all("li", class_='toc_w')

        raw_retval = []
        for chapter in chapters:

            cname, cdate = chapter.a, chapter.span

            if not (cname and cdate):
                self.log.warning("Row with invalid number of entries?")
                continue

            if not cdate.get("title"):
                self.log.error("No time entry?")
                continue

            timestr = cdate.get("title").strip()
            itemDate, status = parsedatetime.Calendar().parse(timestr)

            if status < 1:
                self.log.warning("Failure processing date: %s", timestr)
                continue

            reldate = time.mktime(itemDate)

            relurl = common.util.urlFuncs.rebaseUrl(cname['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "ScribbleHub"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series_id,
                                                          sh=True)

        # Do not add series without 3 chapters.
        if len(raw_retval) < 3:
            self.log.info("Less then three chapters!")
            return []

        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ] + [meta_pkt]

        self.log.info("Found %s chapter releases on series page!", len(retval))
        return retval
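The date handling above relies on parsedatetime's two-value return: Calendar().parse() yields a time.struct_time plus a status flag, where 0 means nothing in the string looked like a date. A minimal usage sketch:

import time
import parsedatetime

cal = parsedatetime.Calendar()
parsed, status = cal.parse("Jun 3, 2020 10:15 PM")
if status >= 1:
    epoch = time.mktime(parsed)   # local-time epoch, as used above
else:
    epoch = None                  # unparseable; the scraper skips the row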
    def process_series(self, series):

        expected_keys = [
            'chapters', 'cover', 'description', 'firstUpdate', 'id',
            'lastUpdate', 'tags', 'title'
        ]
        if not all([tmp in series for tmp in expected_keys]):
            self.log.error("Missing key(s) %s from series %s. Cannot continue",
                           [tmp for tmp in expected_keys if not tmp in series],
                           series)
            return

        # {
        # 	'topCover': None,
        # 	'description': "<p>Gerald, born a Viscount's son, spent most of his life since he was six as an enemy Duke's 'ward', nothing short "
        #  "of a hostage. Until a shocking letter arrived requesting that he be sent back to inherit his father's territory and title.</p>\n<p>Now "
        #  "he has to return and rule the ruin that is his family's lands. Bandits roam&nbsp;and enemies leer. Conspiracies brew and wars rage. "
        #  "Meanwhile, Gerald has to rise with his house from the ashes.</p>\n<p>&nbsp;</p>\n<p>Schedule: Updates 4 times a week--&gt; Monday-"
        #  "Thursday.</p>\n<p>&nbsp;</p>\n<p>Additional tags: Kingdom Building - Strategy - War - Army Building.</p>",
        # 	'id': 19290,
        # 	'firstUpdate': datetime.datetime(2018, 7, 10, 6, 35, 48),
        # 	'topCoverAlignment': 0,
        # 	'chapters': [{'title': 'Chapter 33',
        # 	'fictionId': 19290,
        # 	'date': datetime.datetime(2018, 8, 28, 1, 55, 48),
        # 	'id': 285611}],
        # 	'cover': 'https://royalroadlupload.blob.core.windows.net/thundersurfer/rise-of-the-lord-full-AAAASg1dcgo=.jpg',
        # 	'tags': 'action,fantasy,martial_arts,male_lead,strategy,profanity,gore',
        # 	'title': 'Rise of the Lord',
        # 	'lastUpdate': datetime.datetime(2018, 8, 28, 1, 55, 48)
        #  }

        sinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".
            format(sid=series['id'], key=settings.RRL_API_KEY))

        if not self.validate_sdata(sinfo):
            self.log.warning("Series data for sid %s failed validation" %
                             series['id'])
            return

        assert int(series['id']) == int(
            sinfo['id']), "Mismatchin series ID: %s -> %s (%s, %s)" % (
                series['id'],
                sinfo['id'],
                type(series['id']),
                type(sinfo['id']),
            )

        cinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}"
            .format(sid=series['id'], key=settings.RRL_API_KEY))
        if not self.validate_cdata(cinfo):
            return

        # Order matters! If ratingCount is 0, ratingValue is None (not 0),
        # so check the count first and let `or` short-circuit before the
        # None comparison can raise. Skip unrated or low-rated series.
        if (sinfo.get('ratingCount', 0) < SeriesPageCommon.MIN_RATE_CNT
                or (sinfo.get('ratingValue') or 0) < SeriesPageCommon.MIN_RATING_FLOAT):
            return

        author = sinfo.get("authorName")

        if not author:
            self.log.error("Could not find author for series '%s'",
                           series['id'])
            return

        if isinstance(sinfo['tags'], str):
            tags = sinfo['tags'].split(",")
        elif isinstance(sinfo['tags'], (list, tuple)):
            tags = list(sinfo['tags'])
        else:
            self.log.error("sinfo['tags'] has unexpected type: %s", type(sinfo['tags']))
            self.log.error("Sinfo: %s", sinfo)
            return

        tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

        description = self.extract_description(sinfo['description'])

        title = sinfo['title'].strip()

        seriesmeta = {}

        seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
            sid=series['id'])

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = description
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True
        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        trigger_urls = [seriesPageUrl]

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        raw_retval = []
        for chapter in cinfo:

            reldate = chapter['date']
            chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
                # sid = series['id'],
                cid=chapter['id'], )

            chp_title = chapter['title']
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = chap_url

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            trigger_urls.append(chap_url)
            raw_retval.append(raw_msg)

        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series['id'],
                                                          rrl=True)

        self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        self.amqp_put_many(retval)
        self.low_priority_links_trigger(trigger_urls)
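get_json() is a project-internal helper; a plausible minimal version, under the assumption that `wg` is a fetcher exposing a getpage() call that returns the response body:

# Hypothetical sketch of get_json(); the real helper lives elsewhere in
# the project and likely adds retries and error handling.
import json

def get_json(wg, url):
    raw = wg.getpage(url)             # assumed fetcher interface
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8")
    return json.loads(raw)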
	def extractSeriesReleases(self, seriesPageUrl, metadata, soup):

		title  = metadata['title']
		author = metadata['user']['name']
		desc   = metadata['description']
		tags   = metadata['tags']

		# Apparently the description is rendered in a <pre> tag.
		# Huh?
		desc = markdown.markdown(desc, extensions=["linkify"])

		title = title.strip()

		# Siiiiiigh. Really?
		title = title.replace("[#wattys2015]", "")
		title = title.replace("(Wattys2015) ", "")
		title = title.replace("#Wattys2015", "")
		title = title.replace("Wattys2015", "")
		title = title.strip()

		if metadata['numParts'] < 3:
			return []
		if metadata['voteCount'] < 100:
			return []

		# Language ID 1 is english.
		if metadata['language']['id'] != 1:
			return []

		# Allow blocking of item by ID
		if metadata['id'] in BLOCK_IDS:
			return []

		# for some particularly stupid reasons, the item category tag is
		# not included in the metadata.
		# therefore, we parse it out from the page manually.
		tagdiv = soup.find("div", class_="tags")
		if tagdiv:
			for tag in tagdiv.find_all("a", class_='tag'):
				tags.append(tag.get_text())


		tags = list(set([item.lower().strip().replace("  ", " ").replace(" ", "-") for item in tags]))

		# Mask any content with any of the blocked tags.
		if any([item in tags for item in WATTPAD_MASKED_TAGS]):
			self.log.warning("Item has a masked tag. Not emitting any releases.")
			self.log.warning("Tags: '%s'", tags)
			return []

		# And check that at least one of the target tags is present.
		if not any([item in tags for item in WATTPAD_REQUIRED_TAGS]):
			self.log.warning("Item missing required tag. Not emitting any releases.")
			self.log.warning("Tags: '%s'", tags)
			return []


		seriesmeta = {}

		extra = {}
		extra['tags']        = tags[:]
		extra['homepage']    = seriesPageUrl
		extra['sourcesite']  = 'WattPad'



		retval = []
		index = 1
		valid = 1
		for release in metadata['parts']:
			chp_title = release['title']

			dt = datetime.datetime.strptime(release['modifyDate'], "%Y-%m-%dT%H:%M:%SZ" )
			reldate = calendar.timegm(dt.timetuple())

			raw_item = {}
			raw_item['srcname']   = "WattPad"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release['url']
			msg = msgpackers.buildReleaseMessage(raw_item, title, None, index, None, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
			retval.append(msg)

			# Check if there was substantive structure in the chapter
			# name. Used as a crude heuristic for chapter validity.
			# vol, chp, frag, post = extractTitle(chp_title)
			# if any((vol, chp, frag)):
			# 	# print("Valid: ", (vol, chp, frag))
			# 	valid += 1

			index += 1

		# if valid < (index/2):
		# 	print("Half of the present chapters have no numeric content?")
		# 	return []

		# Don't send the series metadata if we didn't find any chapters.
		if not retval:
			print("No chapters!")
			return []


		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = desc
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'WattPad'


		pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)
		self.log.info("Wattpad scraper generated %s amqp messages!", len(retval) + 1)
		self.amqp_put_item(pkt)
		return retval
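The modifyDate handling above converts WattPad's UTC timestamps to epoch seconds; calendar.timegm (rather than time.mktime) keeps the value in UTC instead of applying the local timezone. As a standalone conversion:

import calendar
import datetime

def wattpad_ts_to_epoch(stamp):
    # WattPad reports times like "2015-06-01T12:30:00Z" (UTC).
    dt = datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")
    return calendar.timegm(dt.timetuple())

print(wattpad_ts_to_epoch("2015-06-01T12:30:00Z"))  # -> 1433161800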
Example #28
    def extractSeriesReleases(self, seriesPageUrl, soup):
        title = soup.find("div", class_='fanfic_title_div').get_text()
        author = soup.find("div", class_='fanfic_author_div').get_text()
        ratingtg = soup.find("div", class_='fanfic_title_wrapper')
        ratingtg = [
            item for item in ratingtg.contents if "Rating" in str(item)
        ]
        if not ratingtg:
            # An empty rating string would crash the three-way unpack below.
            return []
        ratingtg = ratingtg.pop()

        rating, views, chapters = ratingtg.split("·")

        # I think the japtem rating system is just plain out broken.
        if not "no rating" in ratingtg.lower():
            rating_score = float(rating.split()[-1])
            if not rating_score >= MIN_RATING:
                return []

        chapter_num = float(chapters.split()[0])
        if chapter_num < 3:
            return []

        if not title:
            return []
        if not author:
            return []

        descDiv = soup.find('div', class_='fanfic_synopsis')

        if not descDiv:
            print(soup)
            return []

        paras = descDiv.find_all("p")
        tags = []

        desc = []
        for para, text in [(para, para.get_text()) for para in paras]:
            if text.lower().startswith('categories:'):
                tagstr = text.split(":", 1)[-1]
                items = tagstr.split(",")
                tags.extend(item.strip() for item in items if item.strip())
            else:
                desc.append(para)

        seriesmeta = {}

        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = ''
        seriesmeta['desc'] = " ".join([str(para) for para in desc])
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'JapTem Fanfic'

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = ''
        extra['sourcesite'] = 'JapTem Fanfic'

        retval = []

        chapters = soup.find("ul", class_='fanfic_chapter_list')
        volumes = chapters.find_all('li', class_='fanfic_volume')
        for volume in volumes:
            releases = volume.find_all('li', class_='fanfic_chapter')
            for release in releases:
                chp_title = release.find("a")

                vol_str = volume.find('div',
                                      class_='fanfic_volume_title').get_text()
                reldate = time.time()

                chp_title = chp_title.get_text()

                agg_title = " ".join((vol_str, chp_title))
                vol, chp, frag, post = extractTitle(agg_title)

                raw_item = {}
                raw_item['srcname'] = 'JapTem Fanfic'
                raw_item['published'] = reldate
                releaseurl = urllib.parse.urljoin(seriesPageUrl,
                                                  release.a['href'])
                raw_item['linkUrl'] = releaseurl

                raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                         title,
                                                         vol,
                                                         chp,
                                                         frag,
                                                         author=author,
                                                         postfix=chp_title,
                                                         tl_type='oel',
                                                         extraData=extra,
                                                         matchAuthor=True)
                msg = msgpackers.createReleasePacket(raw_msg)

                retval.append(msg)

        if not retval:
            return []

        self.amqp_put_item(meta_pkt)
        return retval
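The urljoin() calls in this extractor resolve relative chapter hrefs against the series page URL. A small illustration (the URLs are made up, not real JapTem paths):

import urllib.parse

base = "https://japtem.com/fanfic.php?action=series&id=123"
print(urllib.parse.urljoin(base, "fanfic.php?action=chapter&id=456"))
# -> https://japtem.com/fanfic.php?action=chapter&id=456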