def process_series(self, series):
	"""
	Validate and ingest one series entry from the RoyalRoadL listing API.

	``series`` is a dict from the fiction listing endpoint. The full series
	info and chapter list are then fetched from the info/chapters API
	endpoints, converted to release/series-info packets, pushed onto AMQP,
	and the chapter URLs queued for low-priority fetching.

	Returns None in all cases; failures are logged and abort processing.
	"""
	expected_keys = [
		'chapters',
		'cover',
		'description',
		'firstUpdate',
		'id',
		'lastUpdate',
		'tags',
		'title',
	]
	if not all([tmp in series for tmp in expected_keys]):
		self.log.error("Missing key(s) %s from series %s. Cannot continue",
			[tmp for tmp in expected_keys if not tmp in series], series)
		return

	# Example listing payload (abridged):
	# {
	# 	'id'          : 19290,
	# 	'title'       : 'Rise of the Lord',
	# 	'description' : "<p>Gerald, born a Viscount's son, ...</p>",
	# 	'cover'       : 'https://royalroadlupload.blob.core.windows.net/...jpg',
	# 	'tags'        : 'action,fantasy,martial_arts,male_lead,strategy,profanity,gore',
	# 	'firstUpdate' : datetime.datetime(2018, 7, 10, 6, 35, 48),
	# 	'lastUpdate'  : datetime.datetime(2018, 8, 28, 1, 55, 48),
	# 	'chapters'    : [{'title': 'Chapter 33', 'fictionId': 19290,
	# 	                  'date': datetime.datetime(2018, 8, 28, 1, 55, 48),
	# 	                  'id': 285611}],
	# }

	sinfo = get_json(
		self.wg,
		"https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".format(
			sid=series['id'], key=settings.RRL_API_KEY))
	if not self.validate_sdata(sinfo):
		self.log.warning("Series data for sid %s failed validation" % series['id'])
		return

	assert int(series['id']) == int(
		sinfo['id']), "Mismatchin series ID: %s -> %s (%s, %s)" % (
			series['id'],
			sinfo['id'],
			type(series['id']),
			type(sinfo['id']),
		)

	cinfo = get_json(
		self.wg,
		"https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}".format(
			sid=series['id'], key=settings.RRL_API_KEY))
	if not self.validate_cdata(cinfo):
		return

	# Order matters! If ratingCount is 0, ratingValue is None (not 0), so the
	# count comparison must short-circuit before the value is compared.
	# BUGFIX: the original condition returned when the series *passed* both
	# thresholds (and therefore only ingested low-rated series). Inverted so
	# we skip series that do NOT meet the rating bar, matching the page
	# scraper's behaviour.
	if not (sinfo.get('ratingCount', 0) > SeriesPageCommon.MIN_RATE_CNT
			and sinfo.get('ratingValue', 0) > SeriesPageCommon.MIN_RATING_FLOAT):
		return

	author = sinfo.get("authorName")
	if not author:
		self.log.error("Could not find author for series '%s'", series['id'])
		return

	if isinstance(sinfo['tags'], str):
		tags = sinfo['tags'].split(",")
	elif isinstance(sinfo['tags'], (list, tuple)):
		tags = list(sinfo['tags'])
	else:
		# BUGFIX: previously only print()ed and fell through with `tags`
		# unbound, raising NameError on the next statement.
		self.log.error("sinfo['tags'] has unknown type %s: %r",
			type(sinfo['tags']), sinfo['tags'])
		self.log.error("Full sinfo: %r", sinfo)
		tags = []

	tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

	description = self.extract_description(sinfo['description'])
	title = sinfo['title'].strip()

	seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
		sid=series['id'])

	seriesmeta = {}
	seriesmeta['title']       = msgpackers.fix_string(title)
	seriesmeta['author']      = msgpackers.fix_string(author)
	seriesmeta['tags']        = tags
	seriesmeta['homepage']    = seriesPageUrl
	seriesmeta['desc']        = description
	seriesmeta['tl_type']     = 'oel'
	seriesmeta['sourcesite']  = 'RoyalRoadL'
	seriesmeta['create_tags'] = True

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	trigger_urls = [seriesPageUrl]

	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'

	raw_retval = []
	for chapter in cinfo:
		reldate = chapter['date']
		chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
			cid=chapter['id'],
		)
		chp_title = chapter['title']
		# Append the series title so the parser can strip it from numbering.
		vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = float(reldate)
		raw_item['linkUrl']   = chap_url

		raw_msg = msgpackers._buildReleaseMessage(raw_item,
			title,
			vol,
			chp,
			frag,
			author=author,
			postfix=chp_title,
			tl_type='oel',
			extraData=extra,
			matchAuthor=True)

		trigger_urls.append(chap_url)
		raw_retval.append(raw_msg)

	raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
		raw_retval, series['id'], rrl=True)

	self.amqp_put_item(meta_pkt)
	retval = [
		msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
	]
	self.amqp_put_many(retval)
	self.low_priority_links_trigger(trigger_urls)
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""
	Extract series metadata and chapter releases from a ScribbleHub
	series page.

	Returns a list of release packets (with the series-info packet
	appended), or an empty list when the page fails validation: missing
	title/author/description tags, rating below threshold, too few
	ratings, or fewer than three chapters.
	"""
	match = self.match_re.search(seriesPageUrl)
	series_id = match.group(1)

	titletg  = soup.find("div", class_='fic_title')
	authortg = soup.find("span", class_='auth_name_fic')

	if not titletg:
		self.log.error("Could not find title tag!")
		return []
	if not authortg:
		self.log.error("Could not find author tag!")
		return []

	# Merge every ld+json blob on the page into one dict; later blobs win.
	metas = soup.find_all("script", type="application/ld+json")
	agg_meta = {}
	for meta in metas:
		loaded = json.loads(meta.get_text())
		for k, v in loaded.items():
			agg_meta[k] = v

	rating     = float(agg_meta.get('ratingValue', "0"))
	rating_cnt = float(agg_meta.get('ratingCount', "0"))

	self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)
	if rating < SeriesPageCommon.MIN_RATING_STARS:
		self.log.error("Item rating below upload threshold: %s", rating)
		return []
	if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
		self.log.error("Item has insufficent ratings: %s", rating_cnt)
		return []

	title  = titletg.get_text().strip()
	author = authortg.get_text().strip()

	# Strip any markup the site sneaks into the title/author strings.
	title = bleach.clean(title,
		tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	author = bleach.clean(author,
		tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

	descDiv = soup.find('div', class_='wi_fic_desc')
	if not descDiv or not descDiv.p:
		self.log.error("Incomplete or broken description?")
		return []

	desc = []
	for segment in descDiv:
		if isinstance(segment, bs4.NavigableString):
			desc.append(str(segment).strip())
		else:
			if segment.get_text().strip():
				desc.append(segment.get_text().strip())
	desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

	tags = []
	tagdiv = soup.find('span', class_='wi_fic_showtags')
	# BUGFIX: guard against a missing tag container (was an unconditional
	# .find_all() on a possibly-None result -> AttributeError).
	if tagdiv:
		for tag in tagdiv.find_all('a', class_='stag'):
			tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
			tagtxt = SeriesPageCommon.fix_tag(tagtxt)
			tags.append(tagtxt)

	# These are separate on SH, but I'm just treating them as tags.
	for tag in soup.find_all('li', class_='mature_contains'):
		tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
		tagtxt = SeriesPageCommon.fix_tag(tagtxt)
		tags.append(tagtxt)

	genres = []
	genrediv = soup.find('span', class_='wi_fic_genre')
	# BUGFIX: same None-guard as for the tag container above.
	if genrediv:
		for genre in genrediv.find_all('a', class_='fic_genre'):
			genretxt = SeriesPageCommon.clean_tag(genre.get_text())
			genretxt = SeriesPageCommon.fix_genre(genretxt)
			genres.append(genretxt)

	seriesmeta = {}
	seriesmeta['title']       = msgpackers.fix_string(title)
	seriesmeta['author']      = msgpackers.fix_string(author)
	seriesmeta['tags']        = tags
	seriesmeta['homepage']    = seriesPageUrl
	seriesmeta['desc']        = "\r\n".join(desc)
	seriesmeta['tl_type']     = 'oel'
	seriesmeta['sourcesite']  = 'ScribbleHub'
	seriesmeta['create_tags'] = True

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	extra = {}
	extra['tags']       = tags
	extra['genres']     = genres
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'ScribbleHub'

	self.log.info("Found %s tags, %s genres", len(tags), len(genres))

	chapters = soup.find_all("li", class_='toc_w')
	# Hoisted out of the loop: Calendar() construction is loop-invariant.
	cal = parsedatetime.Calendar()
	raw_retval = []
	for chapter in chapters:
		cname, cdate = chapter.a, chapter.span
		if not (cname and cdate):
			self.log.warning("Row with invalid number of entries?")
			continue
		if not cdate.get("title"):
			self.log.error("No time entry?")
			continue

		# Release date lives in the span's title attr as human-readable text.
		timestr = cdate.get("title").strip()
		itemDate, status = cal.parse(timestr)
		if status < 1:
			self.log.warning("Failure processing date: %s", timestr)
			continue
		reldate = time.mktime(itemDate)

		relurl = common.util.urlFuncs.rebaseUrl(cname['href'], seriesPageUrl)
		chp_title = cname.get_text().strip()
		# Append the series title so the parser can strip it from numbering.
		vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
		raw_item = {}
		raw_item['srcname']   = "ScribbleHub"
		raw_item['published'] = float(reldate)
		raw_item['linkUrl']   = relurl

		raw_msg = msgpackers._buildReleaseMessage(raw_item,
			title,
			vol,
			chp,
			frag,
			author=author,
			postfix=chp_title,
			tl_type='oel',
			extraData=extra,
			matchAuthor=True)

		raw_retval.append(raw_msg)

	raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
		raw_retval, series_id, sh=True)

	# Do not add series without 3 chapters (also covers the empty case,
	# so the old separate empty-list check was unreachable and is gone).
	if len(raw_retval) < 3:
		self.log.info("Less then three chapters!")
		return []

	retval = [
		msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
	] + [meta_pkt]
	self.log.info("Found %s chapter releases on series page!", len(retval))
	return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
	"""
	Extract series metadata and chapter releases from a RoyalRoad
	series page.

	Honours the per-series 'force_sequential_numbering' override from the
	LUT config, and auto-renumbers when >80% of 5+ chapters lack parsed
	numbering. Returns a list of release packets, or an empty list when
	the page fails validation (missing tags, rating below threshold, or
	fewer than three chapters).
	"""
	match = self.match_re.search(seriesPageUrl)
	series_id = match.group(1)

	conf = load_lut()
	assert 'force_sequential_numbering' in conf
	must_renumber = series_id in conf['force_sequential_numbering']

	header   = soup.find("div", class_='fic-title')
	titletg  = header.find("h1")
	authortg = header.find("h4")
	# Drop the "by" span so get_text() yields just the author's name.
	authortg.find("span").decompose()

	rating_val   = soup.find("meta", property='books:rating:value')
	rating_scale = soup.find("meta", property='books:rating:scale')

	# CHANGED: print() -> self.log for consistency with the rest of the file.
	self.log.info("Rating value: %s, rating scale: %s", rating_val, rating_scale)

	if not rating_val or not rating_scale:
		# CHANGED: was a silent return; log why the page was rejected.
		self.log.error("Could not find rating meta tags!")
		return []

	rval_f   = float(rating_val.get('content', "0"))
	rscale_f = float(rating_scale.get('content', "999999"))

	# Normalize whatever the page's scale is onto a 0-5 star range.
	rating = 5 * (rval_f / rscale_f)
	self.log.info("Float rating: %s", rating)

	# A rating of exactly 0.0 means "no ratings yet"; those are let through.
	if not rating >= MIN_RATING and rating != 0.0:
		self.log.error("Item rating below upload threshold: %s", rating)
		return []

	if not titletg:
		self.log.error("Could not find title tag!")
		return []
	if not authortg:
		self.log.error("Could not find author tag!")
		return []

	title  = titletg.get_text().strip()
	author = authortg.get_text().strip()

	# Strip any markup the site sneaks into the title/author strings.
	title = bleach.clean(title,
		tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
	author = bleach.clean(author,
		tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

	descDiv = soup.find('div', class_='description')
	if not descDiv or not descDiv.div:
		self.log.error("Incomplete or broken description?")
		return []

	desc = []
	for segment in descDiv.div:
		if isinstance(segment, bs4.NavigableString):
			desc.append(str(segment).strip())
		else:
			if segment.get_text().strip():
				desc.append(segment.get_text().strip())
	desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

	tags = []
	tagdiv = soup.find('span', class_='tags')
	for tag in tagdiv.find_all('span', class_='label'):
		tagtxt = tag.get_text().strip().lower().replace(" ", "-")
		# Apply the config-driven tag-rename LUT before storing.
		if tagtxt in conf['tag_rename']:
			tagtxt = conf['tag_rename'][tagtxt]
		tags.append(tagtxt)

	# Content warnings (red list in the fiction-info box) become tags too.
	info_div = soup.find("div", class_='fiction-info')
	warning_div = info_div.find("div", class_='font-red-sunglo')
	if warning_div:
		for warning_tag in warning_div.find_all('li'):
			tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
			if tagtxt in conf['tag_rename']:
				tagtxt = conf['tag_rename'][tagtxt]
			tags.append(tagtxt)

	seriesmeta = {}
	seriesmeta['title']       = msgpackers.fix_string(title)
	seriesmeta['author']      = msgpackers.fix_string(author)
	seriesmeta['tags']        = tags
	seriesmeta['homepage']    = seriesPageUrl
	seriesmeta['desc']        = "\r\n".join(desc)
	seriesmeta['tl_type']     = 'oel'
	seriesmeta['sourcesite']  = 'RoyalRoadL'
	seriesmeta['create_tags'] = True

	meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

	extra = {}
	extra['tags']       = tags
	extra['homepage']   = seriesPageUrl
	extra['sourcesite'] = 'RoyalRoadL'

	chapters = soup.find_all("tr", attrs={"data-url": True})
	raw_retval = []
	for chapter in chapters:
		if len(chapter.find_all("td")) != 2:
			self.log.warning("Row with invalid number of entries?")
			continue
		cname, cdate = chapter.find_all("td")

		reldate = cdate.time['unixtime']
		relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)
		chp_title = cname.get_text().strip()
		# Append the series title so the parser can strip it from numbering.
		vol, chp, frag, _ = extractTitle(chp_title + " " + title)
		raw_item = {}
		raw_item['srcname']   = "RoyalRoadL"
		raw_item['published'] = float(reldate)
		raw_item['linkUrl']   = relurl

		raw_msg = msgpackers.buildReleaseMessage(raw_item,
			title,
			vol,
			chp,
			frag,
			author=author,
			postfix=chp_title,
			tl_type='oel',
			extraData=extra,
			matchAuthor=True)

		raw_retval.append(raw_msg)

	# Count releases the title parser could not number at all.
	missing_chap = 0
	for item in raw_retval:
		if not (item['vol'] or item['chp']):
			missing_chap += 1

	if len(raw_retval):
		unnumbered = (missing_chap / len(raw_retval)) * 100
		# Renumber when forced by config, or when >80% of 5+ chapters
		# have no parsed volume/chapter number.
		if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
			if must_renumber:
				self.log.warning(
					"Item numbering force-overridden! Adding simple sequential chapter numbers."
				)
			else:
				self.log.warning(
					"Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
				)
			chap = 1
			for item in raw_retval:
				item['vol'] = None
				item['chp'] = chap
				chap += 1

	# Do not add series without 3 chapters (also covers the empty case,
	# so the old separate empty-list check was unreachable and is gone).
	if len(raw_retval) < 3:
		self.log.info("Less then three chapters!")
		return []

	# self.amqp_put_item(meta_pkt)
	retval = [
		msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
	]
	return retval