Ejemplo n.º 1
0
 def sendReleases(self, releases):
     """Wrap each found release in a release packet and push it onto the local AMQP queue."""
     self.log.info(
         "Total releases found on page: %s. Emitting messages into AMQP local queue.",
         len(releases))
     packets = (msgpackers.createReleasePacket(item, beta=IS_BETA) for item in releases)
     for packet in packets:
         self.amqp_put_item(packet)
    def extractSeriesReleases(self, row):
        """Parse one release-table row into a release packet.

        Returns the packet, or None when the row has an unexpected shape or
        is missing a title/author.
        """
        cells = row.find_all("td")
        if len(cells) != 4:
            self.log.warning(
                "Row does not have four <td> tags! Don't know how to handle")
            for line in row.prettify().split("\n"):
                self.log.warning(line)
            return None

        title_td, ch_td, trans_td, release_td = cells

        title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)
        author = trans_td.get_text(strip=True)

        if not title or not author:
            return None

        # Cripes this is probably brittle
        series_type = "translated" if "," in author else "oel"

        reldate = float(release_td.span['data-timestamp'])
        chp_title = ch_td.get_text(strip=True)
        vol, chp, frag, _ = extractTitle(chp_title)

        raw_item = {
            'srcname': 'FoxTeller',
            'published': reldate,
            'linkUrl': urllib.parse.urljoin("https://www.foxteller.com",
                                            ch_td.a['href']),
        }

        raw_msg = msgpackers._buildReleaseMessage(
            raw_item=raw_item,
            series=title,
            vol=vol,
            chap=chp,
            frag=frag,
            postfix=chp_title,
            tl_type=series_type,
        )

        return msgpackers.createReleasePacket(raw_msg)
Ejemplo n.º 3
0
	def dispatchBT(self, itemurl, itemtxt):
		"""Build and return a release packet for a Baka-Tsuki release item.

		Strips the trailing "by <author>" segment (any capitalisation) from the
		item text, derives the probable series name, queues the page link for
		fetching, and returns the assembled release packet.
		"""
		titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]
		probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()

		vol, chp, frag, post = extractTitle(titleonly)

		raw_item = {}
		raw_item['srcname']   = "Baka-Tsuki"
		raw_item['published'] = time.time()
		raw_item['linkUrl']   = itemurl

		self.put_page_link(itemurl)

		msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
		msg = msgpackers.createReleasePacket(msg)
		# Bug fix: the packet was built but never returned, so callers always
		# received None (the sibling dispatchNanoDesu returns its packet).
		return msg
Ejemplo n.º 4
0
	def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
		"""Build a release packet for a Nano Desu item.

		Returns None when neither a volume nor a chapter number can be parsed
		from the item text.
		"""
		series_title = NANO_DESU_MAP[netloc]
		vol, chp, frag, post = extractTitle(itemtxt)
		if not vol and not chp:
			return None

		raw_item = {
			'srcname':   "Nano Desu",
			'published': time.time(),
			'linkUrl':   itemurl,
		}

		self.put_page_link(itemurl)

		release = msgpackers.buildReleaseMessage(raw_item, series_title, vol, chp, frag, postfix=post)
		return msgpackers.createReleasePacket(release)
Ejemplo n.º 5
0
    def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
        """Build a release packet for a Nano Desu item.

        Returns None when neither a volume nor a chapter number can be parsed
        from the item text.
        """
        series_title = NANO_DESU_MAP[netloc]
        vol, chp, frag, post = extractTitle(itemtxt)
        if not vol and not chp:
            return None

        raw_item = {
            'srcname': "Nano Desu",
            'published': time.time(),
            'linkUrl': itemurl,
        }

        self.put_page_link(itemurl)

        release = msgpackers.buildReleaseMessage(raw_item,
                                                 series_title,
                                                 vol,
                                                 chp,
                                                 frag,
                                                 postfix=post)
        return msgpackers.createReleasePacket(release)
Ejemplo n.º 6
0
    def dispatchBT(self, itemurl, itemtxt):
        """Build and return a release packet for a Baka-Tsuki release item.

        Strips the trailing "by <author>" segment (any capitalisation) from the
        item text, derives the probable series name, queues the page link for
        fetching, and returns the assembled release packet.
        """
        titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split(
            "BY")[0]
        probSeries = titleonly.lower().split("volume")[0].split(
            "chapter")[0].strip()

        vol, chp, frag, post = extractTitle(titleonly)

        raw_item = {}
        raw_item['srcname'] = "Baka-Tsuki"
        raw_item['published'] = time.time()
        raw_item['linkUrl'] = itemurl

        self.put_page_link(itemurl)

        msg = msgpackers.buildReleaseMessage(raw_item,
                                             probSeries,
                                             vol,
                                             chp,
                                             frag,
                                             postfix=post)
        msg = msgpackers.createReleasePacket(msg)
        # Bug fix: the packet was built but never returned, so callers always
        # received None (the sibling dispatchNanoDesu returns its packet).
        return msg
Ejemplo n.º 7
0
    def extractSeriesReleases(self, seriesPageUrl, soup):
        """Extract release messages for every chapter on a ScribbleHub series page.

        Validates the page (title/author present, rating and rating-count at or
        above the SeriesPageCommon thresholds, at least 3 chapters), then
        returns a list of per-chapter release packets plus one series-metadata
        packet. Returns an empty list on any validation failure.
        """
        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)

        titletg = soup.find("div", class_='fic_title')
        authortg = soup.find("span", class_='auth_name_fic')

        if not titletg:
            self.log.error("Could not find title tag!")
            return []

        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        # Merge all ld+json metadata blobs on the page into one dict
        # (later blobs overwrite earlier keys).
        metas = soup.find_all("script", type="application/ld+json")
        agg_meta = {}
        for meta in metas:
            loaded = json.loads(meta.get_text())
            for k, v in loaded.items():
                agg_meta[k] = v

        rating = float(agg_meta.get('ratingValue', "0"))
        rating_cnt = float(agg_meta.get('ratingCount', "0"))

        self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)

        if rating < SeriesPageCommon.MIN_RATING_STARS:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
            self.log.error("Item has insufficent ratings: %s", rating_cnt)
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        # Strip any markup that leaked into the title/author text.
        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='wi_fic_desc')
        if not descDiv or not descDiv.p:
            self.log.error("Incomplete or broken description?")
            return []

        # Flatten the description into a list of non-empty text segments,
        # each later wrapped in a <p> tag.
        desc = []
        for segment in descDiv:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

        # NOTE(review): tagdiv and genrediv (below) are used without None
        # checks — presumably every series page carries these spans; confirm,
        # otherwise this raises AttributeError.
        tags = []
        tagdiv = soup.find('span', class_='wi_fic_showtags')
        for tag in tagdiv.find_all('a', class_='stag'):
            tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
            tagtxt = SeriesPageCommon.fix_tag(tagtxt)
            tags.append(tagtxt)

        # These are separate on SH, but I'm just treating them as tags.
        for tag in soup.find_all('li', class_='mature_contains'):
            tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
            tagtxt = SeriesPageCommon.fix_tag(tagtxt)
            tags.append(tagtxt)

        genres = []
        genrediv = soup.find('span', class_='wi_fic_genre')
        for genre in genrediv.find_all('a', class_='fic_genre'):
            genretxt = SeriesPageCommon.clean_tag(genre.get_text())
            genretxt = SeriesPageCommon.fix_genre(genretxt)
            genres.append(genretxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'ScribbleHub'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        # Extra payload attached to every per-chapter release message.
        extra = {}
        extra['tags'] = tags
        extra['genres'] = genres
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'ScribbleHub'

        self.log.info("Found %s tags, %s genres", len(tags), len(genres))

        chapters = soup.find_all("li", class_='toc_w')

        raw_retval = []
        for chapter in chapters:

            cname, cdate = chapter.a, chapter.span

            if not (cname and cdate):
                self.log.warning("Row with invalid number of entries?")
                continue

            if not cdate.get("title"):
                self.log.error("No time entry?")
                continue

            # The span's title attribute holds a human-readable date string;
            # parsedatetime returns (struct_time, status), status < 1 = failure.
            timestr = cdate.get("title").strip()
            itemDate, status = parsedatetime.Calendar().parse(timestr)

            if status < 1:
                self.log.warning("Failure processing date: %s", timestr)
                continue

            reldate = time.mktime(itemDate)

            relurl = common.util.urlFuncs.rebaseUrl(cname['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # The series title is appended to give the title parser extra context.
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "ScribbleHub"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series_id,
                                                          sh=True)

        # Do not add series without 3 chapters.
        if len(raw_retval) < 3:
            self.log.info("Less then three chapters!")
            return []

        # NOTE(review): unreachable — the `< 3` check above already rejects an
        # empty list.
        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ] + [meta_pkt]

        self.log.info("Found %s chapter releases on series page!", len(retval))
        return retval
Ejemplo n.º 8
0
 def sendReleases(self, releases):
     """Wrap each found release in a release packet and push it onto the AMQP queue."""
     self.log.info("Total releases found on page: %s", len(releases))
     for item in releases:
         self.amqp_put_item(msgpackers.createReleasePacket(item, beta=IS_BETA))
	def extractSeriesReleases(self, seriesPageUrl, soup):
		"""Extract release messages for every chapter on a RoyalRoadL series page.

		Validates the page (rating tag present, rating above MIN_RATING unless
		unrated, title/author present, at least 3 chapters), pushes the series
		metadata packet onto AMQP, and returns a list of per-chapter release
		packets. Returns an empty list on any validation failure.
		"""
		match = self.match_re.search(seriesPageUrl)
		series_id = match.group(1)
		conf = load_lut()

		assert 'force_sequential_numbering' in conf

		# Some series have broken chapter titles and are force-renumbered
		# sequentially via the LUT.
		must_renumber = series_id in conf['force_sequential_numbering']

		header   = soup.find("div", class_='fic-title')
		titletg  = header.find("h2")
		authortg = header.find("h4")
		# The h4 contains a prefix span (e.g. "by"); drop it so get_text()
		# yields only the author name.
		authortg.find("span").decompose()

		# The rating appears in one of two page layouts.
		ratingtg_type_1 = soup.find("div", class_='rating')
		ratingtg_type_2 = soup.find("li", text=re.compile('Overall Score'))

		if ratingtg_type_1:
			startg = ratingtg_type_1.find("span", class_='star')
		elif ratingtg_type_2:
			starcontainer = ratingtg_type_2.find_next_sibling("li")
			if not starcontainer:
				self.log.error("Could not find rating tag (starcontainer)!")
				return []
			startg = starcontainer.find("span", class_='star')
			if not startg:
				self.log.error("Could not find rating tag (startg)!")
				return []
		else:
			self.log.error("Could not find rating tag!")
			return []

		# The star rating is encoded in a CSS class like "star-45" (4.5 stars).
		ratingcls = [tmp for tmp in startg['class'] if re.match(r"star\-\d+", tmp)]
		# Bug fix: this emptiness check used to come *after* ratingcls[0] was
		# indexed, which raised IndexError instead of returning cleanly.
		if not ratingcls:
			return []
		rating = ratingcls[0].split("-")[-1]

		rating = float(rating) / 10
		rating = rating * 2  # Normalize to 1-10 scale

		# Unrated (0.0) series are allowed through; otherwise enforce the minimum.
		if not rating >= MIN_RATING and rating != 0.0:
			self.log.error("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.error("Could not find title tag!")
			return []
		if not authortg:
			self.log.error("Could not find author tag!")
			return []

		title  = titletg.get_text().strip()
		author = authortg.get_text().strip()

		# Strip any markup that leaked into the title/author text.
		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		if not descDiv or not descDiv.div:
			self.log.error("Incomplete or broken description?")
			return []

		# Flatten the description into a list of non-empty text segments,
		# each wrapped in a <p> tag.
		desc = []
		for segment in descDiv.div:
			if isinstance(segment, bs4.NavigableString):
				desc.append(str(segment).strip())
			else:
				if segment.get_text().strip():
					desc.append(segment.get_text().strip())

		desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

		tags = []
		tagdiv = soup.find('div', class_='tags')
		for tag in tagdiv.find_all('span', class_='label'):
			tagtxt = tag.get_text().strip().lower().replace(" ", "-")
			if tagtxt in conf['tag_rename']:
				tagtxt = conf['tag_rename'][tagtxt]
			tags.append(tagtxt)

		# Content warnings are listed separately on RRL; treat them as extra tags.
		info_div = soup.find("div", class_='fiction-info')
		warning_div = info_div.find("div", class_='font-red-sunglo')
		if warning_div:
			for warning_tag in warning_div.find_all('li'):
				tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
				if tagtxt in conf['tag_rename']:
					tagtxt = conf['tag_rename'][tagtxt]
				tags.append(tagtxt)

		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = "\r\n".join(desc)
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'
		seriesmeta['create_tags'] = True

		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)
		# Extra payload attached to every per-chapter release message.
		extra = {}
		extra['tags']     = tags
		extra['homepage'] = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'

		chapters = soup.find_all("tr", attrs={"data-url" : True})

		raw_retval = []
		for chapter in chapters:
			if len(chapter.find_all("td")) != 2:
				self.log.warning("Row with invalid number of entries?")
				continue
			cname, cdate = chapter.find_all("td")

			reldate = cdate.time['unixtime']
			relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)

			chp_title = cname.get_text().strip()
			# The series title is appended to give the title parser extra context.
			vol, chp, frag, post = extractTitle(chp_title + " " + title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = float(reldate)
			raw_item['linkUrl']   = relurl

			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)

			raw_retval.append(raw_msg)

		# If most chapters failed to parse a number (or the override is set),
		# fall back to simple sequential chapter numbering.
		missing_chap = 0
		for item in raw_retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(raw_retval):
			unnumbered = (missing_chap/len(raw_retval)) * 100
			if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
				if must_renumber:
					self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
				else:
					self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in raw_retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series without 3 chapters (this also rejects an empty list,
		# so the old separate emptiness check was removed as unreachable).
		if len(raw_retval) < 3:
			self.log.info("Less then three chapters!")
			return []

		self.amqp_put_item(meta_pkt)
		retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
		return retval
    def process_series(self, series):
        """Fetch full info and the chapter list for one RoyalRoad series via
        the fiction API, then emit a series-metadata packet plus one release
        packet per chapter onto AMQP, and queue all related URLs for
        low-priority fetching.

        `series` is a dict from the API listing endpoint; it must contain the
        keys in `expected_keys` below, otherwise processing is aborted.
        """
        expected_keys = [
            'chapters', 'cover', 'description', 'firstUpdate', 'id',
            'lastUpdate', 'tags', 'title'
        ]
        if not all([tmp in series for tmp in expected_keys]):
            self.log.error("Missing key(s) %s from series %s. Cannot continue",
                           [tmp for tmp in expected_keys if not tmp in series],
                           series)
            return

        # Example `series` payload (abridged): {'id': 19290, 'title': '...',
        # 'description': '<p>...</p>', 'cover': <url>,
        # 'tags': 'action,fantasy,...',
        # 'chapters': [{'title': ..., 'fictionId': ..., 'date': datetime, 'id': ...}],
        # 'firstUpdate'/'lastUpdate': datetime}

        sinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".
            format(sid=series['id'], key=settings.RRL_API_KEY))

        if not self.validate_sdata(sinfo):
            self.log.warning("Series data for sid %s failed validation" %
                             series['id'])
            return

        assert int(series['id']) == int(
            sinfo['id']), "Mismatchin series ID: %s -> %s (%s, %s)" % (
                series['id'],
                sinfo['id'],
                type(series['id']),
                type(sinfo['id']),
            )

        cinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}"
            .format(sid=series['id'], key=settings.RRL_API_KEY))
        if not self.validate_cdata(cinfo):
            return

        # Order matters! If ratingCount is 0, ratingValue is None (not 0), so
        # the count comparison must short-circuit before ratingValue is used.
        # NOTE(review): as written this bails out when the ratings EXCEED the
        # minimums, which looks inverted relative to the other scrapers (they
        # skip low-rated items). Confirm intent before changing.
        if sinfo.get('ratingCount',
                     0) > SeriesPageCommon.MIN_RATE_CNT and sinfo.get(
                         'ratingValue', 0) > SeriesPageCommon.MIN_RATING_FLOAT:
            return

        author = sinfo.get("authorName")

        if not author:
            self.log.error("Could not find author for series '%s'",
                           series['id'])
            return

        if isinstance(sinfo['tags'], str):
            tags = sinfo['tags'].split(",")
        elif isinstance(sinfo['tags'], (list, tuple)):
            tags = list(sinfo['tags'])
        else:
            print("sinfo unknown type: ", sinfo['tags'])
            print("Sinfo: ", sinfo)
            # Bug fix: `tags` was left unbound on this path, which crashed
            # with a NameError on the next statement. Bail out instead.
            return

        tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

        description = self.extract_description(sinfo['description'])

        title = sinfo['title'].strip()

        seriesmeta = {}

        seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
            sid=series['id'])

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = description
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True
        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        trigger_urls = [seriesPageUrl]

        # Extra payload attached to every per-chapter release message.
        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        raw_retval = []
        for chapter in cinfo:

            reldate = chapter['date']
            chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
                cid=chapter['id'], )

            chp_title = chapter['title']
            # The series title is appended to give the title parser extra context.
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = chap_url

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            trigger_urls.append(chap_url)
            raw_retval.append(raw_msg)

        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series['id'],
                                                          rrl=True)

        self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        self.amqp_put_many(retval)
        self.low_priority_links_trigger(trigger_urls)
	def sendReleases(self, releases):
		"""Wrap each found release in a release packet and push it onto the AMQP queue."""
		self.log.info("Total releases found on page: %s", len(releases))
		for item in releases:
			self.amqp_put_item(msgpackers.createReleasePacket(item, beta=IS_BETA))
	def sendReleases(self, releases):
		"""Wrap each found release in a release packet and push it onto the local AMQP queue."""
		self.log.info("Total releases found on page: %s. Emitting messages into AMQP local queue.", len(releases))
		for item in releases:
			self.amqp_put_item(msgpackers.createReleasePacket(item, beta=self.is_beta))
Ejemplo n.º 13
0
    def extractSeriesReleases(self, seriesPageUrl, soup):
        """Extract release packets for every chapter on a JapTem fanfic page.

        Returns a list of release packets (empty on validation failure) and,
        when releases were found, pushes a series-metadata packet onto AMQP.
        """
        title = soup.find("div", class_='fanfic_title_div').get_text()
        author = soup.find("div", class_='fanfic_author_div').get_text()
        # Pull out the wrapper child carrying the "Rating ... · views · chapters" text.
        ratingtg = soup.find("div", class_='fanfic_title_wrapper')
        ratingtg = [
            item for item in ratingtg.contents if "Rating" in str(item)
        ]
        if not ratingtg:
            ratingtg = ''
        else:
            ratingtg = ratingtg.pop()

        # NOTE(review): when no "Rating" segment was found, ratingtg is '' and
        # this unpack raises ValueError (splitting '' on "·" yields one
        # element) — presumably every fanfic page carries the rating line;
        # confirm.
        rating, views, chapters = ratingtg.split("·")

        # I think the japtem rating system is just plain out broken.
        if not "no rating" in ratingtg.lower():
            rating_score = float(rating.split()[-1])
            if not rating_score >= MIN_RATING:
                return []

        # Skip series with fewer than 3 chapters.
        chapter_num = float(chapters.split()[0])
        if chapter_num < 3:
            return []

        if not title:
            return []
        if not author:
            return []

        descDiv = soup.find('div', class_='fanfic_synopsis')

        # NOTE(review): when descDiv is None this only dumps the soup for
        # debugging and then crashes on find_all below — confirm that is
        # intended.
        if not descDiv:
            print(soup)

        paras = descDiv.find_all("p")
        tags = []

        # Split the synopsis paragraphs into a "Categories:" tag list and the
        # actual description body.
        desc = []
        for para, text in [(para, para.get_text()) for para in paras]:
            if text.lower().startswith('categories:'):
                tagstr = text.split(":", 1)[-1]
                items = tagstr.split(",")
                [tags.append(item.strip()) for item in items if item.strip()]
            else:
                desc.append(para)

        seriesmeta = {}

        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = ''
        seriesmeta['desc'] = " ".join([str(para) for para in desc])
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'JapTem Fanfic'

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = ''
        extra['sourcesite'] = 'JapTem Fanfic'

        retval = []

        # Chapters are grouped by volume; the volume title is prepended to the
        # chapter title before parsing out vol/chp numbers.
        chapters = soup.find("ul", class_='fanfic_chapter_list')
        volumes = chapters.find_all('li', class_='fanfic_volume')
        for volume in volumes:
            releases = volume.find_all('li', class_='fanfic_chapter')
            for release in releases:
                chp_title = release.find("a")

                vol_str = volume.find('div',
                                      class_='fanfic_volume_title').get_text()
                # The page carries no per-chapter timestamp; use "now".
                reldate = time.time()

                chp_title = chp_title.get_text()

                agg_title = " ".join((vol_str, chp_title))
                vol, chp, frag, post = extractTitle(agg_title)

                raw_item = {}
                raw_item['srcname'] = 'JapTem Fanfic'
                raw_item['published'] = reldate
                releaseurl = urllib.parse.urljoin(seriesPageUrl,
                                                  release.a['href'])
                raw_item['linkUrl'] = releaseurl

                raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                         title,
                                                         vol,
                                                         chp,
                                                         frag,
                                                         author=author,
                                                         postfix=chp_title,
                                                         tl_type='oel',
                                                         extraData=extra,
                                                         matchAuthor=True)
                msg = msgpackers.createReleasePacket(raw_msg)

                retval.append(msg)

        if not retval:
            return []

        # Only emit the series metadata once we know there are releases.
        self.amqp_put_item(meta_pkt)
        return retval
Ejemplo n.º 14
0
    def extractSeriesReleases(self, seriesPageUrl, soup):

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)
        conf = load_lut()

        assert 'force_sequential_numbering' in conf

        must_renumber = series_id in conf['force_sequential_numbering']

        # print("")
        # print("Match: ", match, match.groups(), series_id)
        # print("series_id", series_id)
        # print("Renumber:", must_renumber)

        header = soup.find("div", class_='fic-title')
        titletg = header.find("h1")
        authortg = header.find("h4")
        authortg.find("span").decompose()

        rating_val = soup.find("meta", property='books:rating:value')
        rating_scale = soup.find("meta", property='books:rating:scale')

        print("Rating value:", rating_val)
        print("Rating scale:", rating_scale)

        if not rating_val or not rating_scale:
            return []

        rval_f = float(rating_val.get('content', "0"))
        rscale_f = float(rating_scale.get('content', "999999"))

        rating = 5 * (rval_f / rscale_f)

        print("Float rating: ", rating)

        if not rating >= MIN_RATING and rating != 0.0:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if not titletg:
            self.log.error("Could not find title tag!")
            return []
        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='description')
        if not descDiv or not descDiv.div:
            self.log.error("Incomplete or broken description?")
            return []

        desc = []
        for segment in descDiv.div:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
        # print(desc)

        tags = []
        tagdiv = soup.find('span', class_='tags')
        for tag in tagdiv.find_all('span', class_='label'):
            tagtxt = tag.get_text().strip().lower().replace(" ", "-")
            # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
            if tagtxt in conf['tag_rename']:
                tagtxt = conf['tag_rename'][tagtxt]
            tags.append(tagtxt)

        info_div = soup.find("div", class_='fiction-info')
        warning_div = info_div.find("div", class_='font-red-sunglo')
        if warning_div:
            for warning_tag in warning_div.find_all('li'):
                tagtxt = warning_tag.get_text().strip().lower().replace(
                    " ", "-")
                # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
                if tagtxt in conf['tag_rename']:
                    tagtxt = conf['tag_rename'][tagtxt]
                tags.append(tagtxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        chapters = soup.find_all("tr", attrs={"data-url": True})

        raw_retval = []
        for chapter in chapters:
            if len(chapter.find_all("td")) != 2:
                self.log.warning("Row with invalid number of entries?")
                continue
            cname, cdate = chapter.find_all("td")

            reldate = cdate.time['unixtime']
            relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, post = extractTitle(chp_title + " " + title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                     title,
                                                     vol,
                                                     chp,
                                                     frag,
                                                     author=author,
                                                     postfix=chp_title,
                                                     tl_type='oel',
                                                     extraData=extra,
                                                     matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        missing_chap = 0
        for item in raw_retval:
            if not (item['vol'] or item['chp']):
                missing_chap += 1

        if len(raw_retval):
            unnumbered = (missing_chap / len(raw_retval)) * 100
            if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
                if must_renumber:
                    self.log.warning(
                        "Item numbering force-overridden! Adding simple sequential chapter numbers."
                    )
                else:
                    self.log.warning(
                        "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                    )
                chap = 1
                for item in raw_retval:
                    item['vol'] = None
                    item['chp'] = chap
                    chap += 1

        # Do not add series without 3 chapters.
        if len(raw_retval) < 3:
            self.log.info("Less then three chapters!")
            return []

        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        # self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):
		'''
		Parse a JapTem fanfic series page into AMQP release packets.

		Parameters:
			seriesPageUrl -- absolute URL of the series page (used to resolve
			                 relative chapter links).
			soup          -- bs4.BeautifulSoup of the fetched series page.

		Returns a list of msgpacker packets: one release packet per chapter,
		plus a trailing series-info packet. Returns [] when the series fails
		validation (missing title/author/stats/synopsis, rating below
		MIN_RATING, or fewer than 3 chapters).
		'''
		title  = soup.find("div", class_='fanfic_title_div').get_text()
		author = soup.find("div", class_='fanfic_author_div').get_text()

		# The stats blob ("Rating X · Y views · Z chapters") lives as a loose
		# text node inside the title wrapper.
		wrapper = soup.find("div", class_='fanfic_title_wrapper')
		stat_nodes = [item for item in wrapper.contents if "Rating" in str(item)]
		if not stat_nodes:
			# BUGFIX: the old code substituted '' here, which then crashed on
			# the three-way unpack below (ValueError). Without the stats we
			# cannot vet the series, so skip it instead.
			self.log.info("Could not find rating/chapter stats on series page. Skipping.")
			return []
		ratingtg = stat_nodes.pop()

		rating, views, chapters = ratingtg.split("·")

		# I think the japtem rating system is just plain out broken.
		if not "no rating" in ratingtg.lower():
			rating_score = float(rating.split()[-1])
			if not rating_score >= MIN_RATING:
				return []

		# Do not add series without 3 chapters.
		chapter_num = float(chapters.split()[0])
		if chapter_num < 3:
			return []

		if not title:
			return []
		if not author:
			return []

		descDiv = soup.find('div', class_='fanfic_synopsis')
		if not descDiv:
			# BUGFIX: previously this only print()ed the soup and then died
			# with an AttributeError on the next line.
			self.log.error("Could not find synopsis div in series page!")
			return []

		paras = descDiv.find_all("p")
		tags = []
		desc = []

		# Paragraphs starting with 'categories:' carry the tag list;
		# everything else is description body.
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				tags.extend([item.strip() for item in tagstr.split(",") if item.strip()])
			else:
				desc.append(para)

		seriesmeta = {}
		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = ''
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'JapTem'

		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']        = tags
		extra['homepage']    = ''
		extra['sourcesite']  = 'JapTem'

		retval = []

		# Chapters are grouped into per-volume <li>s; the volume title is
		# prepended to the chapter title so extractTitle() can pick up
		# volume numbering.
		chapter_list = soup.find("ul", class_='fanfic_chapter_list')
		volumes = chapter_list.find_all('li', class_='fanfic_volume')
		for volume in volumes:
			vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
			for release in volume.find_all('li', class_='fanfic_chapter'):
				chp_title = release.find("a").get_text()

				# JapTem does not expose per-chapter dates, so stamp with 'now'.
				reldate = time.time()

				agg_title = " ".join((vol_str, chp_title))
				vol, chp, frag, post = extractTitle(agg_title)

				raw_item = {}
				raw_item['srcname']   = "JapTem"
				raw_item['published'] = reldate
				raw_item['linkUrl']   = urllib.parse.urljoin(seriesPageUrl, release.a['href'])

				msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra)
				retval.append(msgpackers.createReleasePacket(msg))

		if not retval:
			return []

		retval.append(meta_pkt)
		return retval
	def extractSeriesReleases(self, seriesPageUrl, soup):
		'''
		Parse a RoyalRoadL series page into AMQP release packets.

		Parameters:
			seriesPageUrl -- absolute URL of the series page.
			soup          -- bs4.BeautifulSoup of the fetched page.

		Side effects: pushes a series-metadata packet onto the AMQP queue
		when the series passes validation.
		Returns a list of release packets, or [] when the series fails
		validation (missing tags, rating below MIN_RATING, fewer than
		3 chapters).
		'''
		titletg  = soup.find("h1", class_='fiction-title')
		authortg = soup.find("span", class_='author')
		ratingtg = soup.find("span", class_='overall')

		if not ratingtg:
			self.log.info("Could not find rating tag!")
			return []

		# A rating of exactly 0.0 means "not yet rated"; let those through.
		rating = float(ratingtg['score'])
		if not rating >= MIN_RATING and rating != 0.0:
			self.log.info("Item rating below upload threshold: %s", rating)
			return []

		if not titletg:
			self.log.info("Could not find title tag!")
			return []
		if not authortg:
			self.log.info("Could not find author tag!")
			return []

		title  = titletg.get_text()
		author = authortg.get_text()

		# BUGFIX: this used to be a bare assert, so any unexpected author
		# string crashed the whole scrape run (and asserts vanish under -O).
		# Degrade gracefully instead.
		if author.startswith("by "):
			author = author[3:].strip()
		else:
			self.log.warning("Author string lacks expected 'by ' prefix: %r", author)
			author = author.strip()

		# Strip any embedded markup from the title/author strings.
		title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
		author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

		descDiv = soup.find('div', class_='description')
		if not descDiv:
			# BUGFIX: a missing description previously caused an
			# AttributeError. Mirrors the guard used by the other
			# RoyalRoadL extractor in this file.
			self.log.error("Incomplete or broken description?")
			return []

		paras = descDiv.find_all("p")
		tags = []
		desc = []

		# Paragraphs starting with 'categories:' carry the tag list;
		# everything else is description body.
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				tags.extend([item.strip() for item in tagstr.split(",") if item.strip()])
			else:
				desc.append(para)

		seriesmeta = {}
		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = seriesPageUrl
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'RoyalRoadL'

		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']        = tags
		extra['homepage']    = seriesPageUrl
		extra['sourcesite']  = 'RoyalRoadL'

		chapters = soup.find("div", class_='chapters')
		if not chapters:
			# BUGFIX: guard against a missing chapter list (was an AttributeError).
			self.log.error("Could not find chapter list!")
			return []
		releases = chapters.find_all('li', class_='chapter')

		raw_retval = []
		for release in releases:
			chp_title, reldatestr = release.find_all("span")

			# Dates are only day-resolution; stamp today's chapters with
			# 'now' so fresh releases sort correctly.
			rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
			if rel.date() == datetime.date.today():
				reldate = time.time()
			else:
				reldate = calendar.timegm(rel.timetuple())

			chp_title = chp_title.get_text()
			# Append the series title so extractTitle() has extra context.
			vol, chp, frag, post = extractTitle(chp_title + " " + title)

			raw_item = {}
			raw_item['srcname']   = "RoyalRoadL"
			raw_item['published'] = reldate
			raw_item['linkUrl']   = release.a['href']

			raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
			raw_retval.append(raw_msg)

		# If >80% of a reasonably-sized chapter list has no extractable
		# numbering, fall back to simple sequential chapter numbers.
		missing_chap = 0
		for item in raw_retval:
			if not (item['vol'] or item['chp']):
				missing_chap += 1

		if len(raw_retval):
			unnumbered = (missing_chap / len(raw_retval)) * 100
			if len(raw_retval) >= 5 and unnumbered > 80:
				self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
				chap = 1
				for item in raw_retval:
					item['vol'] = None
					item['chp'] = chap
					chap += 1

		# Do not add series without 3 chapters. (This also covers the empty
		# case, so the old redundant `if not raw_retval` check is gone.)
		if len(raw_retval) < 3:
			self.log.info("Less then three chapters!")
			return []

		self.amqp_put_item(meta_pkt)
		return [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]