def sendReleases(self, releases):
    """Wrap each extracted release in an AMQP packet and push it onto the local queue."""
    self.log.info(
        "Total releases found on page: %s. Emitting messages into AMQP local queue.",
        len(releases))
    for item in releases:
        self.amqp_put_item(msgpackers.createReleasePacket(item, beta=IS_BETA))
def extractSeriesReleases(self, row):
    """Build a single release packet from one FoxTeller listing row.

    Returns the packed release message, or None when the row is malformed
    or missing a title/author.
    """
    cells = row.find_all("td")
    if len(cells) != 4:
        self.log.warning(
            "Row does not have four <td> tags! Don't know how to handle")
        for dump_line in row.prettify().split("\n"):
            self.log.warning(dump_line)
        return None

    title_td, ch_td, trans_td, release_td = cells
    title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)
    author = trans_td.get_text(strip=True)
    if not (title and author):
        return None

    # Cripes this is probably brittle
    series_type = "translated" if "," in author else "oel"

    chp_title = ch_td.get_text(strip=True)
    vol, chp, frag, _ = extractTitle(chp_title)

    raw_item = {
        'srcname'   : 'FoxTeller',
        'published' : float(release_td.span['data-timestamp']),
        'linkUrl'   : urllib.parse.urljoin("https://www.foxteller.com",
                                           ch_td.a['href']),
    }

    raw_msg = msgpackers._buildReleaseMessage(
        raw_item=raw_item,
        series=title,
        vol=vol,
        chap=chp,
        frag=frag,
        postfix=chp_title,
        tl_type=series_type,
    )
    return msgpackers.createReleasePacket(raw_msg)
def dispatchBT(self, itemurl, itemtxt):
    """Parse a Baka-Tsuki feed entry, queue its page for fetching, and build
    a release packet.

    Args:
        itemurl: URL of the released page.
        itemtxt: Feed entry text, typically "<series> <volume/chapter> by <author>".

    Returns:
        The packed release message. (Bugfix: the packet was previously built
        and then discarded; it is now returned, matching dispatchNanoDesu.)
    """
    # Strip the "by <author>" suffix regardless of capitalization.
    titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]
    # The series name is whatever precedes the volume/chapter designator.
    probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()
    vol, chp, frag, post = extractTitle(titleonly)

    raw_item = {}
    raw_item['srcname'] = "Baka-Tsuki"
    raw_item['published'] = time.time()
    raw_item['linkUrl'] = itemurl

    self.put_page_link(itemurl)

    msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag,
                                         postfix=post)
    msg = msgpackers.createReleasePacket(msg)
    return msg
def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
    """Resolve a Nano Desu release from its hostname and build a release packet.

    Returns the packed release message, or None if no volume/chapter could
    be parsed from the entry text.
    """
    series_title = NANO_DESU_MAP[netloc]
    vol, chp, frag, post = extractTitle(itemtxt)
    if not (vol or chp):
        return None

    raw_item = {
        'srcname'   : "Nano Desu",
        'published' : time.time(),
        'linkUrl'   : itemurl,
    }
    self.put_page_link(itemurl)

    packed = msgpackers.buildReleaseMessage(raw_item, series_title, vol, chp,
                                            frag, postfix=post)
    return msgpackers.createReleasePacket(packed)
def dispatchBT(self, itemurl, itemtxt):
    """Parse a Baka-Tsuki feed entry, queue its page for fetching, and build
    a release packet.

    Args:
        itemurl: URL of the released page.
        itemtxt: Feed entry text, typically "<series> <volume/chapter> by <author>".

    Returns:
        The packed release message. (Bugfix: the packet was previously built
        and then discarded; it is now returned, matching dispatchNanoDesu.)
    """
    # Strip the "by <author>" suffix regardless of capitalization.
    titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split(
        "BY")[0]
    # The series name is whatever precedes the volume/chapter designator.
    probSeries = titleonly.lower().split("volume")[0].split(
        "chapter")[0].strip()
    vol, chp, frag, post = extractTitle(titleonly)

    raw_item = {}
    raw_item['srcname'] = "Baka-Tsuki"
    raw_item['published'] = time.time()
    raw_item['linkUrl'] = itemurl

    self.put_page_link(itemurl)

    msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag,
                                         postfix=post)
    msg = msgpackers.createReleasePacket(msg)
    return msg
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Extract all chapter releases (plus a series-metadata packet) from a
    ScribbleHub series page.

    Args:
        seriesPageUrl: URL of the series page; must match self.match_re,
            which yields the numeric series id in group 1.
        soup: BeautifulSoup document for that page.

    Returns:
        A list of release packets with the series-info packet appended, or
        [] when the page is malformed, under-rated, or has < 3 chapters.
    """
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)

    titletg = soup.find("div", class_='fic_title')
    authortg = soup.find("span", class_='auth_name_fic')
    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    # Merge every ld+json <script> blob on the page into one dict; keys
    # from later blobs overwrite earlier ones.
    metas = soup.find_all("script", type="application/ld+json")
    agg_meta = {}
    for meta in metas:
        loaded = json.loads(meta.get_text())
        for k, v in loaded.items():
            agg_meta[k] = v

    # Gate on the aggregate rating embedded in the page metadata.
    rating = float(agg_meta.get('ratingValue', "0"))
    rating_cnt = float(agg_meta.get('ratingCount', "0"))
    self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)
    if rating < SeriesPageCommon.MIN_RATING_STARS:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []
    if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
        self.log.error("Item has insufficent ratings: %s", rating_cnt)
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()
    # Strip any residual markup out of the user-controlled strings.
    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='wi_fic_desc')
    if not descDiv or not descDiv.p:
        self.log.error("Incomplete or broken description?")
        return []

    # Flatten the description container into one entry per non-empty text
    # segment, then wrap each in a <p> tag.
    desc = []
    for segment in descDiv:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        else:
            if segment.get_text().strip():
                desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

    tags = []
    tagdiv = soup.find('span', class_='wi_fic_showtags')
    for tag in tagdiv.find_all('a', class_='stag'):
        tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
        tagtxt = SeriesPageCommon.fix_tag(tagtxt)
        tags.append(tagtxt)

    # These are separate on SH, but I'm just treating them as tags.
    for tag in soup.find_all('li', class_='mature_contains'):
        tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
        tagtxt = SeriesPageCommon.fix_tag(tagtxt)
        tags.append(tagtxt)

    genres = []
    genrediv = soup.find('span', class_='wi_fic_genre')
    for genre in genrediv.find_all('a', class_='fic_genre'):
        genretxt = SeriesPageCommon.clean_tag(genre.get_text())
        genretxt = SeriesPageCommon.fix_genre(genretxt)
        genres.append(genretxt)

    seriesmeta = {}
    seriesmeta['title'] = msgpackers.fix_string(title)
    seriesmeta['author'] = msgpackers.fix_string(author)
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = "\r\n".join(desc)
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'ScribbleHub'
    seriesmeta['create_tags'] = True
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['genres'] = genres
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'ScribbleHub'
    self.log.info("Found %s tags, %s genres", len(tags), len(genres))

    chapters = soup.find_all("li", class_='toc_w')
    raw_retval = []
    for chapter in chapters:
        cname, cdate = chapter.a, chapter.span
        if not (cname and cdate):
            self.log.warning("Row with invalid number of entries?")
            continue
        if not cdate.get("title"):
            self.log.error("No time entry?")
            continue
        # The release date is a human-readable string in the span's "title"
        # attribute; parsedatetime turns it into a struct_time.
        timestr = cdate.get("title").strip()
        itemDate, status = parsedatetime.Calendar().parse(timestr)
        if status < 1:
            self.log.warning("Failure processing date: %s", timestr)
            continue
        reldate = time.mktime(itemDate)
        relurl = common.util.urlFuncs.rebaseUrl(cname['href'], seriesPageUrl)
        chp_title = cname.get_text().strip()
        # print("Chp title: '{}'".format(chp_title))
        # The series title is appended so the title parser has more context.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "ScribbleHub"
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = relurl
        raw_msg = msgpackers._buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
        # print("Chapter:", raw_item)
        raw_retval.append(raw_msg)

    raw_retval = SeriesPageCommon.check_fix_numbering(self.log, raw_retval, series_id, sh=True)

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []
    if not raw_retval:
        self.log.info("Retval empty?!")
        return []

    retval = [
        msgpackers.createReleasePacket(raw_msg)
        for raw_msg in raw_retval
    ] + [meta_pkt]
    self.log.info("Found %s chapter releases on series page!", len(retval))
    return retval
def sendReleases(self, releases):
    """Pack every release found on the page and enqueue it over AMQP."""
    self.log.info("Total releases found on page: %s", len(releases))
    for entry in releases:
        self.amqp_put_item(msgpackers.createReleasePacket(entry, beta=IS_BETA))
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Extract chapter releases from a RoyalRoadL series page (star-widget
    layout), emit the series-metadata packet over AMQP, and return the
    release packets.

    Args:
        seriesPageUrl: URL of the series page; must match self.match_re,
            which yields the series id in group 1.
        soup: BeautifulSoup document for that page.

    Returns:
        A list of release packets, or [] when the page is malformed, the
        rating is below threshold, or fewer than three chapters exist.
    """
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)

    conf = load_lut()
    assert 'force_sequential_numbering' in conf
    # Some series are known to have unparseable chapter names; those are
    # forcibly renumbered 1..n below.
    must_renumber = series_id in conf['force_sequential_numbering']

    header = soup.find("div", class_='fic-title')
    titletg = header.find("h2")
    authortg = header.find("h4")
    # The h4 contains a decorative span; drop it so get_text() yields just
    # the author name.
    authortg.find("span").decompose()

    # RRL has shipped (at least) two rating widget layouts; handle both.
    ratingtg_type_1 = soup.find("div", class_='rating')
    ratingtg_type_2 = soup.find("li", text=re.compile('Overall Score'))
    if ratingtg_type_1:
        startg = ratingtg_type_1.find("span", class_='star')
    elif ratingtg_type_2:
        starcontainer = ratingtg_type_2.find_next_sibling("li")
        if not starcontainer:
            self.log.error("Could not find rating tag (starcontainer)!")
            return []
        startg = starcontainer.find("span", class_='star')
        if not startg:
            self.log.error("Could not find rating tag (startg)!")
            return []
    else:
        self.log.error("Could not find rating tag!")
        return []

    # The score is encoded in a CSS class like "star-45" (= 4.5 stars).
    ratingcls = [tmp for tmp in startg['class'] if re.match(r"star\-\d+", tmp)]
    # BUGFIX: this guard used to sit *after* ratingcls[0] was read, so an
    # empty match list raised IndexError before the guard could take effect.
    if not ratingcls:
        return []
    rating = ratingcls[0].split("-")[-1]
    rating = float(rating) / 10
    rating = rating * 2  # Normalize to 1-10 scale

    if not rating >= MIN_RATING and rating != 0.0:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []
    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()
    # Strip any residual markup out of the user-controlled strings.
    title = bleach.clean(title, tags=[], attributes=[], styles=[],
                         strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[],
                          strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='description')
    if not descDiv or not descDiv.div:
        self.log.error("Incomplete or broken description?")
        return []
    # Flatten the description into one <p> per non-empty text segment.
    desc = []
    for segment in descDiv.div:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        else:
            if segment.get_text().strip():
                desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

    # Regular tags, plus content warnings folded in as tags; both go
    # through the rename LUT.
    tags = []
    tagdiv = soup.find('div', class_='tags')
    for tag in tagdiv.find_all('span', class_='label'):
        tagtxt = tag.get_text().strip().lower().replace(" ", "-")
        if tagtxt in conf['tag_rename']:
            tagtxt = conf['tag_rename'][tagtxt]
        tags.append(tagtxt)
    info_div = soup.find("div", class_='fiction-info')
    warning_div = info_div.find("div", class_='font-red-sunglo')
    if warning_div:
        for warning_tag in warning_div.find_all('li'):
            tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
            if tagtxt in conf['tag_rename']:
                tagtxt = conf['tag_rename'][tagtxt]
            tags.append(tagtxt)

    seriesmeta = {}
    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = "\r\n".join(desc)
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'RoyalRoadL'
    seriesmeta['create_tags'] = True
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'RoyalRoadL'

    chapters = soup.find_all("tr", attrs={"data-url": True})
    raw_retval = []
    for chapter in chapters:
        if len(chapter.find_all("td")) != 2:
            self.log.warning("Row with invalid number of entries?")
            continue
        cname, cdate = chapter.find_all("td")
        reldate = cdate.time['unixtime']
        relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)
        chp_title = cname.get_text().strip()
        # The series title is appended so the title parser has more context.
        vol, chp, frag, post = extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "RoyalRoadL"
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = relurl
        raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                 author=author,
                                                 postfix=chp_title,
                                                 tl_type='oel',
                                                 extraData=extra,
                                                 matchAuthor=True)
        raw_retval.append(raw_msg)

    # If >80% of a 5+ chapter series has no parseable numbering (or the
    # series is on the force list), fall back to sequential numbering.
    missing_chap = 0
    for item in raw_retval:
        if not (item['vol'] or item['chp']):
            missing_chap += 1
    if len(raw_retval):
        unnumbered = (missing_chap / len(raw_retval)) * 100
        if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
            if must_renumber:
                self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
            else:
                self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
            chap = 1
            for item in raw_retval:
                item['vol'] = None
                item['chp'] = chap
                chap += 1

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []
    if not raw_retval:
        self.log.info("Retval empty?!")
        return []

    self.amqp_put_item(meta_pkt)
    retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
    return retval
def process_series(self, series):
    """Fetch full series and chapter info from the RoyalRoad API for one
    listing stub, then emit a series-info packet and one release packet per
    chapter over AMQP.

    Args:
        series: dict from the listing API; must contain every key in
            `expected_keys` below or processing is aborted.

    Side effects:
        Pushes the metadata packet and all release packets onto the AMQP
        queue, and triggers low-priority fetches for all chapter URLs.
    """
    expected_keys = [
        'chapters', 'cover', 'description', 'firstUpdate', 'id',
        'lastUpdate', 'tags', 'title'
    ]
    if not all(tmp in series for tmp in expected_keys):
        self.log.error("Missing key(s) %s from series %s. Cannot continue",
                       [tmp for tmp in expected_keys if tmp not in series],
                       series)
        return

    # Example (abridged) of what the listing API hands us:
    # {
    #     'id': 19290,
    #     'title': 'Rise of the Lord',
    #     'tags': 'action,fantasy,martial_arts,male_lead,strategy,profanity,gore',
    #     'cover': 'https://royalroadlupload.blob.core.windows.net/...jpg',
    #     'description': "<p>Gerald, born a Viscount's son, ...</p>",
    #     'chapters': [{'title': 'Chapter 33', 'fictionId': 19290,
    #                   'date': datetime.datetime(2018, 8, 28, 1, 55, 48),
    #                   'id': 285611}],
    #     'firstUpdate': datetime.datetime(2018, 7, 10, 6, 35, 48),
    #     'lastUpdate': datetime.datetime(2018, 8, 28, 1, 55, 48),
    # }

    sinfo = get_json(
        self.wg,
        "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".format(
            sid=series['id'], key=settings.RRL_API_KEY))
    if not self.validate_sdata(sinfo):
        self.log.warning("Series data for sid %s failed validation" % series['id'])
        return

    assert int(series['id']) == int(sinfo['id']), \
        "Mismatchin series ID: %s -> %s (%s, %s)" % (
            series['id'],
            sinfo['id'],
            type(series['id']),
            type(sinfo['id']),
        )

    cinfo = get_json(
        self.wg,
        "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}".format(
            sid=series['id'], key=settings.RRL_API_KEY))
    if not self.validate_cdata(cinfo):
        return

    # Only accept series with enough ratings AND a high enough score.
    # BUGFIX: the condition was previously un-negated, which skipped
    # well-rated series and processed poorly-rated ones.
    # Order matters! If ratingCount is 0, ratingValue is None (not 0), so
    # the count comparison must short-circuit before the value comparison.
    if not (sinfo.get('ratingCount', 0) > SeriesPageCommon.MIN_RATE_CNT
            and sinfo.get('ratingValue', 0) > SeriesPageCommon.MIN_RATING_FLOAT):
        return

    author = sinfo.get("authorName")
    if not author:
        self.log.error("Could not find author for series '%s'", series['id'])
        return

    if isinstance(sinfo['tags'], str):
        tags = sinfo['tags'].split(",")
    elif isinstance(sinfo['tags'], (list, tuple)):
        tags = list(sinfo['tags'])
    else:
        print("sinfo unknown type: ", sinfo['tags'])
        print("Sinfo: ", sinfo)
        # BUGFIX: `tags` was previously left unbound on this branch, which
        # raised NameError at the comprehension below. Fall back to no tags.
        tags = []
    tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

    description = self.extract_description(sinfo['description'])
    title = sinfo['title'].strip()

    seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
        sid=series['id'])
    seriesmeta = {}
    seriesmeta['title'] = msgpackers.fix_string(title)
    seriesmeta['author'] = msgpackers.fix_string(author)
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = description
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'RoyalRoadL'
    seriesmeta['create_tags'] = True
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    trigger_urls = [seriesPageUrl]

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'RoyalRoadL'

    raw_retval = []
    for chapter in cinfo:
        reldate = chapter['date']
        chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
            cid=chapter['id'],
        )
        chp_title = chapter['title']
        # The series title is appended so the title parser has more context.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "RoyalRoadL"
        # NOTE(review): float(reldate) assumes the API layer hands back a
        # numeric timestamp here; the example payload above shows datetime
        # objects — confirm get_json's date handling.
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = chap_url
        raw_msg = msgpackers._buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
        trigger_urls.append(chap_url)
        raw_retval.append(raw_msg)

    raw_retval = SeriesPageCommon.check_fix_numbering(self.log, raw_retval,
                                                      series['id'], rrl=True)

    self.amqp_put_item(meta_pkt)
    retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
    self.amqp_put_many(retval)
    self.low_priority_links_trigger(trigger_urls)
def sendReleases(self, releases):
    """Emit one AMQP release packet per release found on the page."""
    self.log.info(
        "Total releases found on page: %s. Emitting messages into AMQP local queue.",
        len(releases))
    for entry in releases:
        self.amqp_put_item(
            msgpackers.createReleasePacket(entry, beta=self.is_beta))
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Extract chapter releases from a JapTem Fanfic series page.

    Emits the series-metadata packet over AMQP and returns the list of
    release packets, or [] when the page is under-rated, too short, or
    missing a title/author.
    """
    title = soup.find("div", class_='fanfic_title_div').get_text()
    author = soup.find("div", class_='fanfic_author_div').get_text()

    # The stats line ("<rating> · <views> · <chapters>") is a bare text
    # node inside the title wrapper; pick the one mentioning "Rating".
    ratingtg = soup.find("div", class_='fanfic_title_wrapper')
    ratingtg = [
        item
        for item
        in ratingtg.contents
        if "Rating" in str(item)
    ]
    if not ratingtg:
        ratingtg = ''
    else:
        ratingtg = ratingtg.pop()
    # NOTE(review): if no "Rating" node was found, ratingtg is '' and this
    # 3-way unpack raises ValueError ([''] has one element) — confirm that
    # pages always carry the stats line.
    rating, views, chapters = ratingtg.split("·")

    # I think the japtem rating system is just plain out broken.
    if not "no rating" in ratingtg.lower():
        rating_score = float(rating.split()[-1])
        if not rating_score >= MIN_RATING:
            return []

    chapter_num = float(chapters.split()[0])
    if chapter_num < 3:
        return []

    if not title:
        return []
    if not author:
        return []

    descDiv = soup.find('div', class_='fanfic_synopsis')
    if not descDiv:
        # NOTE(review): this dumps the page and then falls through to
        # descDiv.find_all, which raises AttributeError on None — the print
        # appears to be a debugging aid for that crash.
        print(soup)

    paras = descDiv.find_all("p")
    tags = []
    desc = []
    # Paragraphs beginning "categories:" are split into tags; everything
    # else becomes part of the description.
    for para, text in [(para, para.get_text()) for para in paras]:
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            items = tagstr.split(",")
            [tags.append(item.strip()) for item in items if item.strip()]
        else:
            desc.append(para)

    seriesmeta = {}
    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = ''
    seriesmeta['desc'] = " ".join([str(para) for para in desc])
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'JapTem Fanfic'

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = ''
    extra['sourcesite'] = 'JapTem Fanfic'

    retval = []
    chapters = soup.find("ul", class_='fanfic_chapter_list')
    volumes = chapters.find_all('li', class_='fanfic_volume')
    for volume in volumes:
        releases = volume.find_all('li', class_='fanfic_chapter')
        for release in releases:
            chp_title = release.find("a")
            vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
            # No per-chapter date on the page; stamp with "now".
            reldate = time.time()
            chp_title = chp_title.get_text()
            # Prefix the volume title so the parser sees both numbers.
            agg_title = " ".join((vol_str, chp_title))
            vol, chp, frag, post = extractTitle(agg_title)
            raw_item = {}
            raw_item['srcname'] = 'JapTem Fanfic'
            raw_item['published'] = reldate
            releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
            raw_item['linkUrl'] = releaseurl
            raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                     author=author,
                                                     postfix=chp_title,
                                                     tl_type='oel',
                                                     extraData=extra,
                                                     matchAuthor=True)
            msg = msgpackers.createReleasePacket(raw_msg)
            retval.append(msg)

    if not retval:
        return []
    self.amqp_put_item(meta_pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Extract chapter releases from a RoyalRoadL series page using the
    OpenGraph-style books:rating:* <meta> tags for the quality gate.

    Returns a list of release packets, or [] when the page is malformed,
    the rating is below threshold, or fewer than three chapters exist.
    Note: the series-metadata packet is built but its AMQP emit is
    currently commented out.
    """
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)

    conf = load_lut()
    assert 'force_sequential_numbering' in conf
    # Some series are known to have unparseable chapter names; those are
    # forcibly renumbered 1..n below.
    must_renumber = series_id in conf['force_sequential_numbering']
    # print("")
    # print("Match: ", match, match.groups(), series_id)
    # print("series_id", series_id)
    # print("Renumber:", must_renumber)

    header = soup.find("div", class_='fic-title')
    titletg = header.find("h1")
    authortg = header.find("h4")
    # The h4 contains a decorative span; drop it so get_text() yields just
    # the author name.
    authortg.find("span").decompose()

    rating_val = soup.find("meta", property='books:rating:value')
    rating_scale = soup.find("meta", property='books:rating:scale')
    # NOTE(review): these prints look like leftover debug output — consider
    # routing through self.log.
    print("Rating value:", rating_val)
    print("Rating scale:", rating_scale)
    if not rating_val or not rating_scale:
        return []

    # Normalize the site rating to a 0-5 scale.
    rval_f = float(rating_val.get('content', "0"))
    rscale_f = float(rating_scale.get('content', "999999"))
    rating = 5 * (rval_f / rscale_f)
    print("Float rating: ", rating)
    if not rating >= MIN_RATING and rating != 0.0:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []

    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()
    # Strip any residual markup out of the user-controlled strings.
    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='description')
    if not descDiv or not descDiv.div:
        self.log.error("Incomplete or broken description?")
        return []
    # Flatten the description into one <p> per non-empty text segment.
    desc = []
    for segment in descDiv.div:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        else:
            if segment.get_text().strip():
                desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
    # print(desc)

    # Regular tags, plus content warnings folded in as tags; both go
    # through the rename LUT.
    tags = []
    tagdiv = soup.find('span', class_='tags')
    for tag in tagdiv.find_all('span', class_='label'):
        tagtxt = tag.get_text().strip().lower().replace(" ", "-")
        # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
        if tagtxt in conf['tag_rename']:
            tagtxt = conf['tag_rename'][tagtxt]
        tags.append(tagtxt)
    info_div = soup.find("div", class_='fiction-info')
    warning_div = info_div.find("div", class_='font-red-sunglo')
    if warning_div:
        for warning_tag in warning_div.find_all('li'):
            tagtxt = warning_tag.get_text().strip().lower().replace(" ", "-")
            # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
            if tagtxt in conf['tag_rename']:
                tagtxt = conf['tag_rename'][tagtxt]
            tags.append(tagtxt)

    seriesmeta = {}
    seriesmeta['title'] = msgpackers.fix_string(title)
    seriesmeta['author'] = msgpackers.fix_string(author)
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = "\r\n".join(desc)
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'RoyalRoadL'
    seriesmeta['create_tags'] = True
    # Built but not currently emitted — see the commented amqp_put_item below.
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'RoyalRoadL'

    chapters = soup.find_all("tr", attrs={"data-url": True})
    raw_retval = []
    for chapter in chapters:
        if len(chapter.find_all("td")) != 2:
            self.log.warning("Row with invalid number of entries?")
            continue
        cname, cdate = chapter.find_all("td")
        reldate = cdate.time['unixtime']
        relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)
        chp_title = cname.get_text().strip()
        # print("Chp title: '{}'".format(chp_title))
        # The series title is appended so the title parser has more context.
        vol, chp, frag, post = extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "RoyalRoadL"
        raw_item['published'] = float(reldate)
        raw_item['linkUrl'] = relurl
        raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                 author=author,
                                                 postfix=chp_title,
                                                 tl_type='oel',
                                                 extraData=extra,
                                                 matchAuthor=True)
        # print("Chapter:", raw_item)
        raw_retval.append(raw_msg)

    # If >80% of a 5+ chapter series has no parseable numbering (or the
    # series is on the force list), fall back to sequential numbering.
    missing_chap = 0
    for item in raw_retval:
        if not (item['vol'] or item['chp']):
            missing_chap += 1
    if len(raw_retval):
        unnumbered = (missing_chap / len(raw_retval)) * 100
        if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
            if must_renumber:
                self.log.warning(
                    "Item numbering force-overridden! Adding simple sequential chapter numbers."
                )
            else:
                self.log.warning(
                    "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                )
            chap = 1
            for item in raw_retval:
                item['vol'] = None
                item['chp'] = chap
                chap += 1

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []
    if not raw_retval:
        self.log.info("Retval empty?!")
        return []

    # self.amqp_put_item(meta_pkt)
    retval = [
        msgpackers.createReleasePacket(raw_msg)
        for raw_msg in raw_retval
    ]
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Extract chapter releases from a JapTem series page.

    Returns the release packets with the series-metadata packet appended,
    or [] when the page is under-rated, too short, or missing a
    title/author.
    """
    title = soup.find("div", class_='fanfic_title_div').get_text()
    author = soup.find("div", class_='fanfic_author_div').get_text()

    # The stats line ("<rating> · <views> · <chapters>") is a bare text
    # node inside the title wrapper; pick the one mentioning "Rating".
    ratingtg = soup.find("div", class_='fanfic_title_wrapper')
    ratingtg = [item for item in ratingtg.contents if "Rating" in str(item)]
    if not ratingtg:
        ratingtg = ''
    else:
        ratingtg = ratingtg.pop()
    # NOTE(review): if no "Rating" node was found, ratingtg is '' and this
    # 3-way unpack raises ValueError ([''] has one element) — confirm that
    # pages always carry the stats line.
    rating, views, chapters = ratingtg.split("·")

    # I think the japtem rating system is just plain out broken.
    if not "no rating" in ratingtg.lower():
        rating_score = float(rating.split()[-1])
        if not rating_score >= MIN_RATING:
            return []

    chapter_num = float(chapters.split()[0])
    if chapter_num < 3:
        return []

    if not title:
        return []
    if not author:
        return []

    descDiv = soup.find('div', class_='fanfic_synopsis')
    if not descDiv:
        # NOTE(review): this dumps the page and then falls through to
        # descDiv.find_all, which raises AttributeError on None — the print
        # appears to be a debugging aid for that crash.
        print(soup)

    paras = descDiv.find_all("p")
    tags = []
    desc = []
    # Paragraphs beginning "categories:" are split into tags; everything
    # else becomes part of the description.
    for para, text in [(para, para.get_text()) for para in paras]:
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            items = tagstr.split(",")
            [tags.append(item.strip()) for item in items if item.strip()]
        else:
            desc.append(para)

    seriesmeta = {}
    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = ''
    seriesmeta['desc'] = " ".join([str(para) for para in desc])
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'JapTem'

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = ''
    extra['sourcesite'] = 'JapTem'

    retval = []
    chapters = soup.find("ul", class_='fanfic_chapter_list')
    volumes = chapters.find_all('li', class_='fanfic_volume')
    for volume in volumes:
        releases = volume.find_all('li', class_='fanfic_chapter')
        for release in releases:
            chp_title = release.find("a")
            vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
            # No per-chapter date on the page; stamp with "now".
            reldate = time.time()
            chp_title = chp_title.get_text()
            # Prefix the volume title so the parser sees both numbers.
            agg_title = " ".join((vol_str, chp_title))
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, post = extractTitle(agg_title)
            raw_item = {}
            raw_item['srcname'] = "JapTem"
            raw_item['published'] = reldate
            releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
            raw_item['linkUrl'] = releaseurl
            msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                 author=author,
                                                 postfix=chp_title,
                                                 tl_type='oel',
                                                 extraData=extra)
            msg = msgpackers.createReleasePacket(msg)
            retval.append(msg)

    if not retval:
        return []
    retval.append(meta_pkt)
    # return []
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Extract chapter releases from a (legacy-layout) RoyalRoadL series
    page.

    Emits the series-metadata packet over AMQP and returns the list of
    release packets, or [] when the page is malformed, under-rated, or has
    fewer than three chapters.
    """
    titletg = soup.find("h1", class_='fiction-title')
    authortg = soup.find("span", class_='author')
    ratingtg = soup.find("span", class_='overall')
    if not ratingtg:
        self.log.info("Could not find rating tag!")
        return []

    # The rating is carried in the span's "score" attribute.
    rating = float(ratingtg['score'])
    if not rating >= MIN_RATING and rating != 0.0:
        self.log.info("Item rating below upload threshold: %s", rating)
        return []

    if not titletg:
        self.log.info("Could not find title tag!")
        return []
    if not authortg:
        self.log.info("Could not find author tag!")
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    # Author text is rendered as "by <name>"; drop the prefix (the leading
    # space left by [2:] is removed by strip()).
    assert author.startswith("by ")
    author = author[2:].strip()
    # Strip any residual markup out of the user-controlled strings.
    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='description')
    paras = descDiv.find_all("p")
    tags = []
    desc = []
    # Paragraphs beginning "categories:" are split into tags; everything
    # else becomes part of the description.
    for para, text in [(para, para.get_text()) for para in paras]:
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            items = tagstr.split(",")
            [tags.append(item.strip()) for item in items if item.strip()]
        else:
            desc.append(para)

    seriesmeta = {}
    seriesmeta['title'] = title
    seriesmeta['author'] = author
    seriesmeta['tags'] = tags
    seriesmeta['homepage'] = seriesPageUrl
    seriesmeta['desc'] = " ".join([str(para) for para in desc])
    seriesmeta['tl_type'] = 'oel'
    seriesmeta['sourcesite'] = 'RoyalRoadL'

    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    # Extra payload attached to every per-chapter release message.
    extra = {}
    extra['tags'] = tags
    extra['homepage'] = seriesPageUrl
    extra['sourcesite'] = 'RoyalRoadL'

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('li', class_='chapter')
    raw_retval = []
    for release in releases:
        chp_title, reldatestr = release.find_all("span")
        # Dates are day-resolution; for today's releases use the current
        # time instead of midnight.
        rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
        if rel.date() == datetime.date.today():
            reldate = time.time()
        else:
            reldate = calendar.timegm(rel.timetuple())
        chp_title = chp_title.get_text()
        # print("Chp title: '{}'".format(chp_title))
        # The series title is appended so the title parser has more context.
        vol, chp, frag, post = extractTitle(chp_title + " " + title)
        raw_item = {}
        raw_item['srcname'] = "RoyalRoadL"
        raw_item['published'] = reldate
        raw_item['linkUrl'] = release.a['href']
        raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                 author=author,
                                                 postfix=chp_title,
                                                 tl_type='oel',
                                                 extraData=extra,
                                                 matchAuthor=True)
        raw_retval.append(raw_msg)

    # If >80% of a 5+ chapter series has no parseable numbering, fall back
    # to simple sequential numbering.
    missing_chap = 0
    for item in raw_retval:
        if not (item['vol'] or item['chp']):
            missing_chap += 1
    if len(raw_retval):
        unnumbered = (missing_chap/len(raw_retval)) * 100
        if len(raw_retval) >= 5 and unnumbered > 80:
            self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
            chap = 1
            for item in raw_retval:
                item['vol'] = None
                item['chp'] = chap
                chap += 1

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []
    if not raw_retval:
        self.log.info("Retval empty?!")
        return []

    self.amqp_put_item(meta_pkt)
    retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
    return retval