def process_series(self, series):
        """
        Fetch the full metadata and chapter list for a RoyalRoadL series via
        the site's JSON API, and emit the results as AMQP packets.

        `series` is a partial series dict from the listing API; it must
        contain at least the keys in `expected_keys` below (notably 'id').

        Side effects: queues one series-info packet and one release packet per
        chapter, and triggers low-priority fetches for the series and chapter
        URLs. Returns None in all cases.
        """

        expected_keys = [
            'chapters', 'cover', 'description', 'firstUpdate', 'id',
            'lastUpdate', 'tags', 'title'
        ]
        if not all(key in series for key in expected_keys):
            self.log.error("Missing key(s) %s from series %s. Cannot continue",
                           [key for key in expected_keys if key not in series],
                           series)
            return

        # Example `series` structure:
        # {
        # 	'id': 19290,
        # 	'title': 'Rise of the Lord',
        # 	'description': "<p>Gerald, born a Viscount's son, ...</p>",
        # 	'cover': 'https://royalroadlupload.blob.core.windows.net/thundersurfer/rise-of-the-lord-full-AAAASg1dcgo=.jpg',
        # 	'tags': 'action,fantasy,martial_arts,male_lead,strategy,profanity,gore',
        # 	'firstUpdate': datetime.datetime(2018, 7, 10, 6, 35, 48),
        # 	'lastUpdate': datetime.datetime(2018, 8, 28, 1, 55, 48),
        # 	'topCover': None,
        # 	'topCoverAlignment': 0,
        # 	'chapters': [{'title': 'Chapter 33',
        # 		'fictionId': 19290,
        # 		'date': datetime.datetime(2018, 8, 28, 1, 55, 48),
        # 		'id': 285611}],
        # }

        sinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".
            format(sid=series['id'], key=settings.RRL_API_KEY))

        if not self.validate_sdata(sinfo):
            # Lazy %-style args instead of eager interpolation.
            self.log.warning("Series data for sid %s failed validation",
                             series['id'])
            return

        assert int(series['id']) == int(
            sinfo['id']), "Mismatching series ID: %s -> %s (%s, %s)" % (
                series['id'],
                sinfo['id'],
                type(series['id']),
                type(sinfo['id']),
            )

        cinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}"
            .format(sid=series['id'], key=settings.RRL_API_KEY))
        if not self.validate_cdata(cinfo):
            return

        # Quality gate: only continue for series with both enough ratings AND
        # a high enough average rating. (The previous version had this
        # inverted and returned for series that passed both thresholds.)
        # Order matters! If ratingCount is 0, ratingValue is None (not 0), so
        # the count check must short-circuit before the value is compared;
        # `or 0` additionally guards against an explicit None ratingValue.
        if not (sinfo.get('ratingCount', 0) > SeriesPageCommon.MIN_RATE_CNT
                and (sinfo.get('ratingValue') or 0) >
                SeriesPageCommon.MIN_RATING_FLOAT):
            return

        author = sinfo.get("authorName")

        if not author:
            self.log.error("Could not find author for series '%s'",
                           series['id'])
            return

        if isinstance(sinfo['tags'], str):
            tags = sinfo['tags'].split(",")
        elif isinstance(sinfo['tags'], (list, tuple)):
            tags = list(sinfo['tags'])
        else:
            # Previously this branch printed and fell through with `tags`
            # unbound (NameError below); bail out explicitly and use the
            # logger instead of print().
            self.log.error("Unknown type for tags (%s) in series info: %s",
                           type(sinfo['tags']), sinfo)
            return

        tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

        description = self.extract_description(sinfo['description'])

        title = sinfo['title'].strip()

        seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
            sid=series['id'])

        seriesmeta = {
            'title': msgpackers.fix_string(title),
            'author': msgpackers.fix_string(author),
            'tags': tags,
            'homepage': seriesPageUrl,
            'desc': description,
            'tl_type': 'oel',
            'sourcesite': 'RoyalRoadL',
            'create_tags': True,
        }
        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        trigger_urls = [seriesPageUrl]

        # Per-release extra data shared by every chapter message.
        extra = {
            'tags': tags,
            'homepage': seriesPageUrl,
            'sourcesite': 'RoyalRoadL',
        }

        raw_retval = []
        for chapter in cinfo:

            # NOTE(review): `date` from the JSON chapter API is assumed to be
            # a numeric timestamp (float() below) — confirm against the API.
            reldate = chapter['date']
            chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
                cid=chapter['id'], )

            chp_title = chapter['title']
            # Append the series title so the parser has context for ambiguous
            # chapter names.
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {
                'srcname': "RoyalRoadL",
                'published': float(reldate),
                'linkUrl': chap_url,
            }

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            trigger_urls.append(chap_url)
            raw_retval.append(raw_msg)

        # Repair/normalize chapter numbering where the parsed numbers are
        # missing or inconsistent.
        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series['id'],
                                                          rrl=True)

        self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        self.amqp_put_many(retval)
        self.low_priority_links_trigger(trigger_urls)
# ---- Ejemplo n.º 2 (example 2) ----
# NOTE(review): this separator line and a stray bare "0" were non-Python paste
# artifacts from the original code listing; converted to comments so the module parses.
    def extractSeriesReleases(self, seriesPageUrl, soup):
        """
        Extract series metadata and chapter releases from a ScribbleHub series
        page.

        seriesPageUrl -- canonical URL of the series page (used for link
                         rebasing and as the series homepage).
        soup          -- bs4 BeautifulSoup parse of the page HTML.

        Returns a list of release packets plus the series-info packet, or an
        empty list when the page is incomplete or fails the rating thresholds.
        """

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)

        titletg = soup.find("div", class_='fic_title')
        authortg = soup.find("span", class_='auth_name_fic')

        if not titletg:
            self.log.error("Could not find title tag!")
            return []

        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        # Merge all ld+json metadata blobs into one dict (later blobs win on
        # key collisions).
        agg_meta = {}
        for meta in soup.find_all("script", type="application/ld+json"):
            agg_meta.update(json.loads(meta.get_text()))

        rating = float(agg_meta.get('ratingValue', "0"))
        rating_cnt = float(agg_meta.get('ratingCount', "0"))

        self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)

        if rating < SeriesPageCommon.MIN_RATING_STARS:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
            self.log.error("Item has insufficient ratings: %s", rating_cnt)
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        # Strip any markup/comments that leaked into the extracted strings.
        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='wi_fic_desc')
        if not descDiv or not descDiv.p:
            self.log.error("Incomplete or broken description?")
            return []

        # Flatten the description into one entry per non-empty text segment.
        desc = []
        for segment in descDiv:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            elif segment.get_text().strip():
                desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

        tags = []
        # Guard: pages without a tag block previously raised AttributeError
        # on the None find() result.
        tagdiv = soup.find('span', class_='wi_fic_showtags')
        if tagdiv:
            for tag in tagdiv.find_all('a', class_='stag'):
                tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
                tags.append(SeriesPageCommon.fix_tag(tagtxt))

        # These are separate on SH, but I'm just treating them as tags.
        for tag in soup.find_all('li', class_='mature_contains'):
            tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
            tags.append(SeriesPageCommon.fix_tag(tagtxt))

        genres = []
        genrediv = soup.find('span', class_='wi_fic_genre')
        if genrediv:
            for genre in genrediv.find_all('a', class_='fic_genre'):
                genretxt = SeriesPageCommon.clean_tag(genre.get_text())
                genres.append(SeriesPageCommon.fix_genre(genretxt))

        seriesmeta = {
            'title': msgpackers.fix_string(title),
            'author': msgpackers.fix_string(author),
            'tags': tags,
            'homepage': seriesPageUrl,
            'desc': "\r\n".join(desc),
            'tl_type': 'oel',
            'sourcesite': 'ScribbleHub',
            'create_tags': True,
        }

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        # Per-release extra data shared by every chapter message.
        extra = {
            'tags': tags,
            'genres': genres,
            'homepage': seriesPageUrl,
            'sourcesite': 'ScribbleHub',
        }

        self.log.info("Found %s tags, %s genres", len(tags), len(genres))

        chapters = soup.find_all("li", class_='toc_w')

        # Calendar construction is loop-invariant; build it once.
        date_parser = parsedatetime.Calendar()

        raw_retval = []
        for chapter in chapters:

            cname, cdate = chapter.a, chapter.span

            if not (cname and cdate):
                self.log.warning("Row with invalid number of entries?")
                continue

            if not cdate.get("title"):
                self.log.error("No time entry?")
                continue

            # The release date lives in the span's `title` attribute as a
            # human-readable string; parsedatetime turns it into a time tuple.
            timestr = cdate.get("title").strip()
            itemDate, status = date_parser.parse(timestr)

            if status < 1:
                self.log.warning("Failure processing date: %s", timestr)
                continue

            reldate = time.mktime(itemDate)

            relurl = common.util.urlFuncs.rebaseUrl(cname['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # Append the series title so the parser has context for ambiguous
            # chapter names.
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {
                'srcname': "ScribbleHub",
                'published': float(reldate),
                'linkUrl': relurl,
            }

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            raw_retval.append(raw_msg)

        # Repair/normalize chapter numbering where the parsed numbers are
        # missing or inconsistent.
        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series_id,
                                                          sh=True)

        # Do not add series with fewer than 3 chapters. (This also covers the
        # empty case, which was previously re-checked redundantly.)
        if len(raw_retval) < 3:
            self.log.info("Less than three chapters!")
            return []

        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ] + [meta_pkt]

        self.log.info("Found %s chapter releases on series page!", len(retval))
        return retval
    def extractSeriesReleases(self, seriesPageUrl, soup):
        """
        Extract series metadata and chapter releases from a RoyalRoadL series
        page (HTML-scrape path, as opposed to the JSON API path).

        seriesPageUrl -- canonical URL of the series page (used for link
                         rebasing and as the series homepage).
        soup          -- bs4 BeautifulSoup parse of the page HTML.

        Returns a list of release packets, or an empty list when the page is
        missing required content or fails the rating threshold. The series
        info packet is built but deliberately not queued here (see the
        commented-out amqp_put_item call near the end).
        """

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)
        conf = load_lut()

        assert 'force_sequential_numbering' in conf

        # Some series are known to need forced sequential chapter numbering.
        must_renumber = series_id in conf['force_sequential_numbering']

        header = soup.find("div", class_='fic-title')
        titletg = header.find("h1")
        authortg = header.find("h4")
        # Strip the nested span (presumably a "by" prefix) from the author
        # heading before extracting its text.
        authortg.find("span").decompose()

        rating_val = soup.find("meta", property='books:rating:value')
        rating_scale = soup.find("meta", property='books:rating:scale')

        # Use the logger instead of bare print() for debug output.
        self.log.info("Rating value: %s", rating_val)
        self.log.info("Rating scale: %s", rating_scale)

        if not rating_val or not rating_scale:
            return []

        rval_f = float(rating_val.get('content', "0"))
        # Huge default scale => a missing scale yields a near-zero rating
        # rather than a divide-by-zero.
        rscale_f = float(rating_scale.get('content', "999999"))

        # Normalize to a 0-5 star scale.
        rating = 5 * (rval_f / rscale_f)

        self.log.info("Float rating: %s", rating)

        # A rating of exactly 0.0 is allowed through (unrated series).
        if rating < MIN_RATING and rating != 0.0:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if not titletg:
            self.log.error("Could not find title tag!")
            return []
        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        # Strip any markup/comments that leaked into the extracted strings.
        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='description')
        if not descDiv or not descDiv.div:
            self.log.error("Incomplete or broken description?")
            return []

        # Flatten the description into one entry per non-empty text segment.
        desc = []
        for segment in descDiv.div:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            elif segment.get_text().strip():
                desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

        def _normalize_tag(text):
            # Lowercase, hyphenate, and apply the configured rename LUT.
            tagtxt = text.strip().lower().replace(" ", "-")
            return conf['tag_rename'].get(tagtxt, tagtxt)

        tags = []
        # Guard: pages without a tag block previously raised AttributeError
        # on the None find() result.
        tagdiv = soup.find('span', class_='tags')
        if tagdiv:
            for tag in tagdiv.find_all('span', class_='label'):
                tags.append(_normalize_tag(tag.get_text()))

        # Content warnings are folded into the tag list as well.
        info_div = soup.find("div", class_='fiction-info')
        warning_div = info_div.find("div",
                                    class_='font-red-sunglo') if info_div else None
        if warning_div:
            for warning_tag in warning_div.find_all('li'):
                tags.append(_normalize_tag(warning_tag.get_text()))

        seriesmeta = {
            'title': msgpackers.fix_string(title),
            'author': msgpackers.fix_string(author),
            'tags': tags,
            'homepage': seriesPageUrl,
            'desc': "\r\n".join(desc),
            'tl_type': 'oel',
            'sourcesite': 'RoyalRoadL',
            'create_tags': True,
        }

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        # Per-release extra data shared by every chapter message.
        extra = {
            'tags': tags,
            'homepage': seriesPageUrl,
            'sourcesite': 'RoyalRoadL',
        }

        chapters = soup.find_all("tr", attrs={"data-url": True})

        raw_retval = []
        for chapter in chapters:
            # Hoist the cell lookup; the previous version ran find_all twice.
            cells = chapter.find_all("td")
            if len(cells) != 2:
                self.log.warning("Row with invalid number of entries?")
                continue
            cname, cdate = cells

            reldate = cdate.time['unixtime']
            relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # Append the series title so the parser has context for ambiguous
            # chapter names.
            vol, chp, frag, post = extractTitle(chp_title + " " + title)

            raw_item = {
                'srcname': "RoyalRoadL",
                'published': float(reldate),
                'linkUrl': relurl,
            }

            raw_msg = msgpackers.buildReleaseMessage(raw_item,
                                                     title,
                                                     vol,
                                                     chp,
                                                     frag,
                                                     author=author,
                                                     postfix=chp_title,
                                                     tl_type='oel',
                                                     extraData=extra,
                                                     matchAuthor=True)

            raw_retval.append(raw_msg)

        # If most chapters lack usable volume/chapter numbers (or the series
        # is in the force-renumber list), fall back to simple sequential
        # numbering.
        missing_chap = sum(1 for item in raw_retval
                           if not (item['vol'] or item['chp']))

        if raw_retval:
            unnumbered = (missing_chap / len(raw_retval)) * 100
            if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
                if must_renumber:
                    self.log.warning(
                        "Item numbering force-overridden! Adding simple sequential chapter numbers."
                    )
                else:
                    self.log.warning(
                        "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                    )
                for chap_num, item in enumerate(raw_retval, start=1):
                    item['vol'] = None
                    item['chp'] = chap_num

        # Do not add series with fewer than 3 chapters. (This also covers the
        # empty case, which was previously re-checked redundantly.)
        if len(raw_retval) < 3:
            self.log.info("Less than three chapters!")
            return []

        # self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        return retval