def extractSeriesReleases(self, row):
    """
    Build a release packet from a single FoxTeller listing-table row.

    Expects exactly four <td> cells (title, chapter, translator, release
    date). Returns the packed release message, or None for rows that do
    not match that layout or that lack a title/translator.
    """
    cells = row.find_all("td")
    if len(cells) != 4:
        self.log.warning("Row does not have four <td> tags! Don't know how to handle")
        # Dump the offending row to the log, line by line, for debugging.
        for dbg_line in row.prettify().split("\n"):
            self.log.warning(dbg_line)
        return None

    title_cell, chap_cell, trans_cell, date_cell = cells

    series_title = title_cell.find("div", class_='ellipsis-1').get_text(strip=True)
    translator = trans_cell.get_text(strip=True)

    # Rows missing either a series name or a translator are unusable.
    if not series_title:
        return None
    if not translator:
        return None

    # Cripes this is probably brittle — a comma in the translator field
    # is used as the heuristic for translated-vs-OEL works.
    tl_type = "translated" if "," in translator else "oel"

    timestamp = float(date_cell.span['data-timestamp'])
    chap_name = chap_cell.get_text(strip=True)
    vol, chp, frag, _ = extractTitle(chap_name)

    release_info = {
        'srcname': 'FoxTeller',
        'published': timestamp,
        'linkUrl': urllib.parse.urljoin("https://www.foxteller.com", chap_cell.a['href']),
    }

    release_msg = msgpackers._buildReleaseMessage(
        raw_item=release_info,
        series=series_title,
        vol=vol,
        chap=chp,
        frag=frag,
        postfix=chap_name,
        tl_type=tl_type,
    )
    return msgpackers.createReleasePacket(release_msg)
def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
    """
    Produce a release packet for a Nano Desu feed item.

    The series title is looked up from NANO_DESU_MAP by netloc. Items
    whose text yields neither a volume nor a chapter number are dropped
    (returns None). The item URL is also queued for low-priority
    fetching as a side effect.
    """
    series_name = NANO_DESU_MAP[netloc]
    vol, chp, frag, postfix = extractTitle(itemtxt)

    # An item with no parseable numbering is not a release we can track.
    if not vol and not chp:
        return None

    release_info = {
        'srcname': "Nano Desu",
        'published': time.time(),
        'linkUrl': itemurl,
    }

    self.low_priority_links_trigger([itemurl, ])

    packed = msgpackers._buildReleaseMessage(release_info, series_name, vol, chp, frag, postfix=postfix)
    return msgpackers.createReleasePacket(packed)
def dispatchBT(self, itemurl, itemtxt):
    """
    Produce a release packet for a Baka-Tsuki feed item.

    The probable series name is derived by stripping any trailing
    "by <author>" clause and truncating at the first "volume"/"chapter"
    token. The item URL is queued for low-priority fetching as a side
    effect.

    Returns the packed release message.

    Bugfix: the packet was previously built but never returned, so every
    Baka-Tsuki item was silently discarded (compare dispatchNanoDesu,
    which returns its packet).
    """
    # Strip a trailing "by <author>" clause, in any capitalization variant.
    titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split(
        "BY")[0]
    # The series name is whatever precedes the volume/chapter designator.
    probSeries = titleonly.lower().split("volume")[0].split(
        "chapter")[0].strip()
    vol, chp, frag, post = extractTitle(titleonly)

    raw_item = {}
    raw_item['srcname'] = "Baka-Tsuki"
    raw_item['published'] = time.time()
    raw_item['linkUrl'] = itemurl

    self.low_priority_links_trigger([itemurl, ])

    msg = msgpackers._buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
    msg = msgpackers.createReleasePacket(msg)
    # Previously missing: without this, the caller always received None.
    return msg
def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Parse a ScribbleHub series page into release packets.

    Extracts series metadata (title, author, description, tags, genres)
    and one release message per table-of-contents entry. Returns a list
    of release packets with the series-info packet appended, or [] when
    the page is unusable or fails the rating / chapter-count thresholds.
    """
    # The series ID is embedded in the page URL.
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)

    titletg = soup.find("div", class_='fic_title')
    authortg = soup.find("span", class_='auth_name_fic')

    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    # Merge every ld+json <script> blob on the page into one flat dict;
    # later blobs overwrite earlier keys.
    metas = soup.find_all("script", type="application/ld+json")
    agg_meta = {}
    for meta in metas:
        loaded = json.loads(meta.get_text())
        for k, v in loaded.items():
            agg_meta[k] = v

    rating = float(agg_meta.get('ratingValue', "0"))
    rating_cnt = float(agg_meta.get('ratingCount', "0"))
    self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)

    # Quality gates: skip poorly rated or insufficiently rated series.
    if rating < SeriesPageCommon.MIN_RATING_STARS:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []
    if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
        self.log.error("Item has insufficent ratings: %s", rating_cnt)
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()

    # Strip any embedded markup out of the title/author strings.
    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='wi_fic_desc')
    if not descDiv or not descDiv.p:
        self.log.error("Incomplete or broken description?")
        return []

    # Flatten the description container into non-empty text segments,
    # then re-wrap each segment in a <p> tag.
    desc = []
    for segment in descDiv:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        else:
            if segment.get_text().strip():
                desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

    tags = []
    tagdiv = soup.find('span', class_='wi_fic_showtags')
    for tag in tagdiv.find_all('a', class_='stag'):
        tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
        tagtxt = SeriesPageCommon.fix_tag(tagtxt)
        tags.append(tagtxt)

    # These are separate on SH, but I'm just treating them as tags.
    for tag in soup.find_all('li', class_='mature_contains'):
        tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
        tagtxt = SeriesPageCommon.fix_tag(tagtxt)
        tags.append(tagtxt)

    genres = []
    genrediv = soup.find('span', class_='wi_fic_genre')
    for genre in genrediv.find_all('a', class_='fic_genre'):
        genretxt = SeriesPageCommon.clean_tag(genre.get_text())
        genretxt = SeriesPageCommon.fix_genre(genretxt)
        genres.append(genretxt)

    seriesmeta = {}
    seriesmeta['title'] = msgpackers.fix_string(title)
    seriesmeta['author'] = msgpackers.fix_string(author)
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = "\r\n".join(desc)
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'ScribbleHub'
    seriesmeta['create_tags'] = True

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['genres'] = genres
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'ScribbleHub'

    self.log.info("Found %s tags, %s genres", len(tags), len(genres))

    chapters = soup.find_all("li", class_='toc_w')
    raw_retval = []
    for chapter in chapters:
        cname, cdate = chapter.a, chapter.span
        if not (cname and cdate):
            self.log.warning("Row with invalid number of entries?")
            continue
        if not cdate.get("title"):
            self.log.error("No time entry?")
            continue
        # The release time is a human-readable string in the span's
        # title attribute; parsedatetime returns (struct_time, status)
        # where status < 1 means the parse failed.
        timestr = cdate.get("title").strip()
        itemDate, status = parsedatetime.Calendar().parse(timestr)
        if status < 1:
            self.log.warning("Failure processing date: %s", timestr)
            continue
        reldate = time.mktime(itemDate)
        relurl = common.util.urlFuncs.rebaseUrl(cname['href'], seriesPageUrl)
        chp_title = cname.get_text().strip()
        # NOTE(review): the series title is appended to the chapter
        # title before parsing — presumably to aid the title parser;
        # confirm against extractTitle's behavior.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "ScribbleHub"
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = relurl
        raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                  title,
                                                  vol,
                                                  chp,
                                                  frag,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
        raw_retval.append(raw_msg)

    # Repair chapter numbering where the site's numbering is unusable.
    raw_retval = SeriesPageCommon.check_fix_numbering(self.log, raw_retval, series_id, sh=True)

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []
    if not raw_retval:
        self.log.info("Retval empty?!")
        return []

    retval = [
        msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
    ] + [meta_pkt]
    self.log.info("Found %s chapter releases on series page!", len(retval))
    return retval
def extractSeriesReleases(self, seriesPageUrl, metadata, soup):
    """
    Build release messages for a WattPad story from its API metadata.

    Filters out short (<3 parts), unpopular (<100 votes), non-English,
    and explicitly blocked stories, as well as stories with a masked tag
    or with no required tag. Emits the series-info packet via
    amqp_put_item and returns the list of per-part release messages
    (possibly []).

    Bugfix: the masked-tag and missing-required-tag paths previously
    used a bare ``return`` (yielding None) while every other rejection
    path returned [] — callers that iterate or len() the result would
    crash. Both paths now return [].
    """
    title = metadata['title']
    author = metadata['user']['name']
    desc = metadata['description']
    tags = metadata['tags']

    # Apparently the description is rendered in a <pre> tag.
    # Huh?
    desc = markdown.markdown(desc, extensions=["mdx_linkify"])

    title = title.strip()

    # Siiiiiigh. Really? Strip the contest-entry decorations out of
    # the title.
    title = title.replace("[#wattys2015]", "")
    title = title.replace("(Wattys2015) ", "")
    title = title.replace("#Wattys2015", "")
    title = title.replace("Wattys2015", "")
    title = title.strip()

    if metadata['numParts'] < 3:
        return []
    if metadata['voteCount'] < 100:
        return []
    # Language ID 1 is english.
    if metadata['language']['id'] != 1:
        return []
    # Allow blocking of item by ID
    if metadata['id'] in BLOCK_IDS:
        return []

    # for some particularly stupid reasons, the item category tag is
    # not included in the metadata.
    # therefore, we parse it out from the page manually.
    tagdiv = soup.find("div", class_="tags")
    if tagdiv:
        for tag in tagdiv.find_all("a", class_='tag'):
            tags.append(tag.get_text())

    # Normalize tags: lowercase, spaces to dashes, deduplicate.
    # NOTE(review): the first replace(" ", " ") is a no-op; it may
    # originally have been replace("  ", " ") (double-space collapse)
    # and lost a character somewhere — confirm against history.
    tags = list(
        set([
            item.lower().strip().replace(" ", " ").replace(" ", "-")
            for item in tags
        ]))

    # Mask any content with any of the blocked tags.
    if any([item in tags for item in WATTPAD_MASKED_TAGS]):
        self.log.warning(
            "Item has a masked tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    # And check that at least one of the target tags is present.
    if not any([item in tags for item in WATTPAD_REQUIRED_TAGS]):
        self.log.warning(
            "Item missing required tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    seriesmeta = {}

    extra = {}
    extra['tags'] = tags[:]
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'WattPad'

    retval = []
    # Parts carry no usable numbering of their own, so chapters are
    # numbered sequentially in listing order.
    index = 1
    for release in metadata['parts']:
        chp_title = release['title']
        dt = datetime.datetime.strptime(release['modifyDate'], "%Y-%m-%dT%H:%M:%SZ")
        reldate = calendar.timegm(dt.timetuple())

        raw_item = {}
        raw_item['srcname'] = "WattPad"
        raw_item['published'] = reldate
        raw_item['linkUrl'] = release['url']

        msg = msgpackers._buildReleaseMessage(raw_item,
                                              title,
                                              None,
                                              index,
                                              None,
                                              author=author,
                                              postfix=chp_title,
                                              tl_type='oel',
                                              extraData=extra,
                                              matchAuthor=True)
        retval.append(msg)
        index += 1

    # Don't send the series metadata if we didn't find any chapters.
    if not retval:
        print("No chapters!")
        return []

    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = desc
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'WattPad'

    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)
    self.log.info("Wattpad scraper generated %s amqp messages!", len(retval) + 1)
    self.amqp_put_item(pkt)
    return retval
def process_series(self, series):
    """
    Fetch full metadata and the chapter list for one series from the
    RoyalRoad API, then emit a series-info packet plus one release
    packet per chapter, and queue all URLs for low-priority fetching.

    `series` is a dict from the API's fiction listing; it must contain
    every key in `expected_keys` below.

    Bugfix: the rating gate's condition was inverted — it returned early
    for series that PASSED both rating thresholds and processed the ones
    that failed (every other scraper here skips sub-threshold series).
    It now skips series that fail either threshold.

    Bugfix: `tags` was left unbound (NameError on the next line) when
    sinfo['tags'] had an unexpected type; it now falls back to [].
    """
    expected_keys = [
        'chapters', 'cover', 'description', 'firstUpdate', 'id',
        'lastUpdate', 'tags', 'title'
    ]
    if not all([tmp in series for tmp in expected_keys]):
        self.log.error("Missing key(s) %s from series %s. Cannot continue",
                       [tmp for tmp in expected_keys if not tmp in series],
                       series)
        return

    sinfo = get_json(
        self.wg,
        "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".
        format(sid=series['id'], key=settings.RRL_API_KEY))

    if not self.validate_sdata(sinfo):
        self.log.warning("Series data for sid %s failed validation" %
                         series['id'])
        return

    assert int(series['id']) == int(
        sinfo['id']), "Mismatchin series ID: %s -> %s (%s, %s)" % (
            series['id'],
            sinfo['id'],
            type(series['id']),
            type(sinfo['id']),
        )

    cinfo = get_json(
        self.wg,
        "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}"
        .format(sid=series['id'], key=settings.RRL_API_KEY))

    if not self.validate_cdata(cinfo):
        return

    # Order matters! If ratingCount is 0, ratingValue is None (not 0),
    # so the count check must come first to short-circuit before the
    # None comparison can raise.
    # Skip series that fail either rating threshold (the condition was
    # previously inverted).
    if not (sinfo.get('ratingCount', 0) > SeriesPageCommon.MIN_RATE_CNT
            and sinfo.get('ratingValue', 0) > SeriesPageCommon.MIN_RATING_FLOAT):
        return

    author = sinfo.get("authorName")
    if not author:
        self.log.error("Could not find author for series '%s'",
                       series['id'])
        return

    # The API has returned tags both as a comma-joined string and as a
    # list, depending on endpoint/version.
    if isinstance(sinfo['tags'], str):
        tags = sinfo['tags'].split(",")
    elif isinstance(sinfo['tags'], (list, tuple)):
        tags = list(sinfo['tags'])
    else:
        print("sinfo unknown type: ", sinfo['tags'])
        print("Sinfo: ", sinfo)
        # Previously fell through with `tags` unbound -> NameError.
        tags = []

    tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

    description = self.extract_description(sinfo['description'])
    title = sinfo['title'].strip()

    seriesmeta = {}
    seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
        sid=series['id'])

    seriesmeta['title'] = msgpackers.fix_string(title)
    seriesmeta['author'] = msgpackers.fix_string(author)
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = description
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'RoyalRoadL'
    seriesmeta['create_tags'] = True

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    trigger_urls = [seriesPageUrl]

    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'RoyalRoadL'

    raw_retval = []
    for chapter in cinfo:
        # NOTE(review): chapter['date'] appears as a datetime object in
        # some payloads — float() below would fail on that. Confirm
        # get_json's date handling upstream.
        reldate = chapter['date']
        chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
            cid=chapter['id'], )
        chp_title = chapter['title']
        # NOTE(review): the series title is appended to the chapter
        # title before parsing — presumably to aid the title parser.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "RoyalRoadL"
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = chap_url
        raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                  title,
                                                  vol,
                                                  chp,
                                                  frag,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
        trigger_urls.append(chap_url)
        raw_retval.append(raw_msg)

    # Repair chapter numbering where the site's numbering is unusable.
    raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                      raw_retval,
                                                      series['id'],
                                                      rrl=True)

    self.amqp_put_item(meta_pkt)
    retval = [
        msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
    ]
    self.amqp_put_many(retval)
    self.low_priority_links_trigger(trigger_urls)
def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Parse a JapTem Fanfic series page into release packets.

    Extracts the title, author, rating line, description and category
    tags, then builds one release message per chapter in each volume
    list. Emits the series-info packet via amqp_put_item only when at
    least one release was produced. Returns the list of release packets
    (possibly []).
    """
    title = soup.find("div", class_='fanfic_title_div').get_text()
    author = soup.find("div", class_='fanfic_author_div').get_text()

    # Pick the wrapper child that carries the "Rating ..." text.
    ratingtg = soup.find("div", class_='fanfic_title_wrapper')
    ratingtg = [item for item in ratingtg.contents if "Rating" in str(item)]
    if not ratingtg:
        ratingtg = ''
    else:
        ratingtg = ratingtg.pop()

    # The line is "rating · views · chapters".
    # NOTE(review): when no "Rating" child was found, ratingtg is ''
    # and ''.split("·") yields a single element — this unpack would
    # raise ValueError. Confirm whether such pages can occur.
    rating, views, chapters = ratingtg.split("·")

    # I think the japtem rating system is just plain out broken.
    if not "no rating" in ratingtg.lower():
        rating_score = float(rating.split()[-1])
        if not rating_score >= MIN_RATING:
            return []

    # Skip series with fewer than three chapters.
    chapter_num = float(chapters.split()[0])
    if chapter_num < 3:
        return []

    if not title:
        return []
    if not author:
        return []

    descDiv = soup.find('div', class_='fanfic_synopsis')

    # NOTE(review): if descDiv is None this only prints the soup and
    # then raises AttributeError on find_all below — confirm intent.
    if not descDiv:
        print(soup)

    paras = descDiv.find_all("p")

    # A "Categories: a, b, c" paragraph is split out into tags; all
    # other paragraphs form the description.
    tags = []
    desc = []
    for para, text in [(para, para.get_text()) for para in paras]:
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            items = tagstr.split(",")
            [tags.append(item.strip()) for item in items if item.strip()]
        else:
            desc.append(para)

    seriesmeta = {}
    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = ''
    seriesmeta['desc'] = " ".join([str(para) for para in desc])
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'JapTem Fanfic'

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {}
    extra['tags'] = tags
    extra['homepage'] = ''
    extra['sourcesite'] = 'JapTem Fanfic'

    retval = []

    chapters = soup.find("ul", class_='fanfic_chapter_list')
    volumes = chapters.find_all('li', class_='fanfic_volume')
    for volume in volumes:
        releases = volume.find_all('li', class_='fanfic_chapter')
        for release in releases:
            chp_title = release.find("a")
            vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
            # No per-release timestamp on the page; use "now".
            reldate = time.time()
            chp_title = chp_title.get_text()
            # Parse numbering from "<volume title> <chapter title>".
            agg_title = " ".join((vol_str, chp_title))
            vol, chp, frag, post = extractTitle(agg_title)
            raw_item = {}
            raw_item['srcname'] = 'JapTem Fanfic'
            raw_item['published'] = reldate
            releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
            raw_item['linkUrl'] = releaseurl
            raw_msg = msgpackers._buildReleaseMessage(
                raw_item = raw_item,
                series = title,
                vol = vol,
                chap = chp,
                frag = frag,
                author = author,
                postfix = chp_title,
                tl_type = 'oel',
                extraData = extra,
                matchAuthor = True,
                looseMatch = True
            )
            msg = msgpackers.createReleasePacket(raw_msg)
            retval.append(msg)

    if not retval:
        return []

    self.amqp_put_item(meta_pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Parse a Booksie series page into release messages.

    Filters to novel-type items, extracts title/author/genre and the
    cleaned tag set, and requires at least one BOOKSIE_REQUIRED_TAGS
    entry while rejecting any BOOKSIE_MASKED_TAGS entry. Emits the
    series-info packet via amqp_put_item only when releases were found.
    Returns the list of release messages (possibly []).

    Raises ValueError when no title container exists at all.
    """
    # Yeah, the title text is in a div with an id of "titlePic".
    # The actual image is in a div with the /class/ titlePic
    # wat.
    titlecontainer = soup.find("div", id='titlePic')
    if not titlecontainer:
        titlecontainer = soup.find("div", id='title')
    if not titlecontainer:
        raise ValueError("No title at URL: '%s'", seriesPageUrl)

    titletg = titlecontainer.h1

    # The container holds exactly three links: type, author, category.
    typetg, authortg, categorytg = titlecontainer.find_all("a")
    if "novel" not in typetg.get_text().lower():
        return []
    if not titletg:
        return []
    if not authortg:
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    genre = categorytg.get_text()

    # Strip links out of the summary before harvesting its text nodes.
    descDiv = soup.find('p', class_='summary')
    for item in descDiv.find_all("a"):
        item.decompose()
    desc = [
        item.strip() for item in descDiv.find_all(text=True) if item.strip()
    ]

    tagdiv = soup.find("div", id='cloudMain')
    tags = []
    # Skip if no tags
    if tagdiv:
        tags = [
            item.get_text().strip().lower() for item in tagdiv.find_all("a")
        ]
    tags.append(genre.lower())

    # Clean up common user tagging mistakes: merge split "science"
    # "fiction" tags, drop blocked and too-short tags, dash-join the
    # rest, and deduplicate.
    if 'science' in tags and 'fiction' in tags:
        tags.append("science-fiction")
    tags = [tag for tag in tags if tag not in BAD_TAGS]
    tags = [tag for tag in tags if len(tag) > 2]
    tags = [tag.replace(" ", " ").replace(" ", "-") for tag in tags]
    tags = list(set(tags))

    if not any([tag in BOOKSIE_REQUIRED_TAGS for tag in tags]):
        self.log.info("Missing required tags!")
        return []
    if any([tag in BOOKSIE_MASKED_TAGS for tag in tags]):
        self.log.info("Masked tag!")
        return []

    # Wrap the paragraphs in p tags.
    desc = ['<p>{text}</p>'.format(text=para) for para in desc]

    seriesmeta = {}
    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = "\n\n ".join([str(para) for para in desc])
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'Booksie'

    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)

    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'Booksie'

    # Decompose the announcement (?) div that's cluttering up the
    # search for the chapterdiv
    # NOTE(review): if the notice div is absent, badchp is None and
    # decompose() would raise AttributeError — confirm it is always
    # present.
    badchp = soup.find("div", class_='chapters', id='noticeMessage')
    badchp.decompose()

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('a')
    retval = []
    for release in releases:
        # No post time, unfortunately
        # Link text is the bare chapter number.
        chp = int(release.get_text())
        reldate = time.time()
        # Force releases to the beginning of time untill we catch up.
        # (Deliberately overwrites the time.time() above.)
        reldate = 0
        vol = None
        frag = None
        raw_item = {}
        raw_item['srcname'] = "Booksie"
        raw_item['published'] = reldate
        raw_item['linkUrl'] = release['href']
        msg = msgpackers._buildReleaseMessage(raw_item,
                                              title,
                                              vol,
                                              chp,
                                              frag,
                                              author=author,
                                              tl_type='oel',
                                              extraData=extra,
                                              matchAuthor=True)
        retval.append(msg)

    if not retval:
        print("No releases?")
        return []

    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Parse a RoyalRoadL series page (HTML) into release packets.

    Extracts title/author/description/tags, applies the rating gate,
    builds one release message per chapter-table row, and renumbers
    chapters sequentially when the page's numbering is unusable (or
    forced via the 'force_sequential_numbering' LUT entry). Emits the
    series-info packet via amqp_put_item and returns the list of
    release packets, or [] on any rejection.
    """
    # The series ID is embedded in the page URL.
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)

    # Per-site configuration lookup table (tag renames, forced
    # renumbering overrides).
    conf = load_lut()

    assert 'force_sequential_numbering' in conf
    must_renumber = series_id in conf['force_sequential_numbering']

    header = soup.find("div", class_='fic-title')
    if not header:
        self.log.warning(
            "Series page %s contains no releases. Is this series removed?",
            seriesPageUrl)
        return []

    titletg = header.find("h1")
    authortg = header.find("h4")
    # Drop the decorative span inside the author h4 before get_text().
    authortg.find("span").decompose()

    # Rating is published as Open Graph meta tags; normalize onto a
    # 5-star scale.
    rating_val = soup.find("meta", property='books:rating:value')
    rating_scale = soup.find("meta", property='books:rating:scale')

    if not rating_val or not rating_scale:
        return []

    rval_f = float(rating_val.get('content', "0"))
    rscale_f = float(rating_scale.get('content', "999999"))
    rating = 5 * (rval_f / rscale_f)

    if rating < MIN_RATING:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []

    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()

    # Strip any embedded markup out of the title/author strings.
    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='description')
    if not descDiv or not descDiv.div:
        self.log.error("Incomplete or broken description?")
        return []

    # Flatten the description into non-empty text segments, then
    # re-wrap each segment in a <p> tag.
    desc = []
    for segment in descDiv.div:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        else:
            if segment.get_text().strip():
                desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

    # Collect tags, applying the configured rename table.
    tags = []
    tagdiv = soup.find('span', class_='tags')
    for tag in tagdiv.find_all('span', class_='label'):
        tagtxt = tag.get_text().strip().lower().replace(" ", "-")
        if tagtxt in conf['tag_rename']:
            tagtxt = conf['tag_rename'][tagtxt]
        tags.append(tagtxt)

    # Content warnings are folded into the tag list as well.
    info_div = soup.find("div", class_='fiction-info')
    warning_div = info_div.find("div", class_='font-red-sunglo')
    if warning_div:
        for warning_tag in warning_div.find_all('li'):
            tagtxt = warning_tag.get_text().strip().lower().replace(
                " ", "-")
            if tagtxt in conf['tag_rename']:
                tagtxt = conf['tag_rename'][tagtxt]
            tags.append(tagtxt)

    seriesmeta = {}
    seriesmeta['title'] = msgpackers.fix_string(title)
    seriesmeta['author'] = msgpackers.fix_string(author)
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = "\r\n".join(desc)
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'RoyalRoadL'
    seriesmeta['create_tags'] = True

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'RoyalRoadL'

    # Chapter rows are the <tr> elements carrying a data-url attribute.
    chapters = soup.find_all("tr", attrs={"data-url": True})

    raw_retval = []
    for chapter in chapters:
        if len(chapter.find_all("td")) != 2:
            self.log.warning("Row with invalid number of entries?")
            continue

        cname, cdate = chapter.find_all("td")

        # Human-readable release time; parsedatetime returns
        # (struct_time, status) with status < 1 meaning parse failure.
        timestr = cdate.get_text(strip=True)
        itemDate, status = parsedatetime.Calendar().parse(timestr)

        if status < 1:
            continue

        reldate = time.mktime(itemDate)
        relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'],
                                                seriesPageUrl)
        chp_title = cname.get_text().strip()
        # NOTE(review): the series title is appended to the chapter
        # title before parsing — presumably to aid the title parser.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "RoyalRoadL"
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = relurl
        raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                  title,
                                                  vol,
                                                  chp,
                                                  frag,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
        raw_retval.append(raw_msg)

    # Heuristic: if 5+ chapters exist and over 80% carry no parsed
    # numbering (or renumbering is forced by config), replace the
    # numbering with a simple sequential count.
    missing_chap = 0
    for item in raw_retval:
        if not (item['vol'] or item['chp']):
            missing_chap += 1

    if raw_retval:
        unnumbered = (missing_chap / len(raw_retval)) * 100
        if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
            if must_renumber:
                self.log.warning(
                    "Item numbering force-overridden! Adding simple sequential chapter numbers."
                )
            else:
                self.log.warning(
                    "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                )
            chap = 1
            for item in raw_retval:
                item['vol'] = None
                item['chp'] = chap
                chap += 1

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []
    if not raw_retval:
        self.log.info("Retval empty?!")
        return []

    self.amqp_put_item(meta_pkt)

    retval = [
        msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
    ]
    return retval