def extractSeriesReleases(self, seriesPageUrl, soup):
    """Collect serialized release messages from a Qidian series page.

    Every ``<a class="chapter-link">`` in ``soup`` is read through its
    ``data-preprocessor-*`` attributes. Links whose release date cannot be
    parsed are skipped. State ``'0'`` produces a release message, state
    ``"2"`` a release-delete message; any other state is printed and ignored.

    Returns the list of serialized messages, or an empty list when fewer than
    three messages were produced.
    """
    packed = []

    for anchor in soup.find_all("a", class_='chapter-link'):
        # All attributes are read up front (even those only used for the
        # debug print), so a link missing any of them raises KeyError.
        state = anchor['data-preprocessor-state']
        vol = anchor['data-preprocessor-vol']
        chp = anchor['data-preprocessor-chp']
        name = anchor['data-preprocessor-name']
        index = anchor['data-preprocessor-index']
        title = anchor['data-preprocessor-title']
        raw_date = anchor['data-preprocessor-reldate']
        href = anchor['href']

        parsed_date, status = parsedatetime.Calendar().parse(raw_date)
        if status < 1:
            # The date string could not be interpreted.
            continue

        published = time.mktime(parsed_date)
        relurl = common.util.urlFuncs.rebaseUrl(href + "/", seriesPageUrl)

        print([vol, chp, state, anchor])

        raw_item = {
            'srcname': "Qidian",
            'published': float(published),
            'linkUrl': relurl,
        }

        if state == '0':
            build = msgpackers.buildReleaseMessageWithType
        elif state == "2":
            build = msgpackers.buildReleaseDeleteMessageWithType
        else:
            print("Unknown state:", state)
            continue

        raw_msg = build(raw_item, title, None, index, None,
                        tl_type='translated', prefixMatch=True)
        packed.append(msgpackers.serialize_message(raw_msg))

    # Do not add series without 3 chapters.
    if len(packed) < 3:
        self.log.info("Less then three chapters!")
        return []

    return packed
def extractSeriesReleases(self, row):
    """Turn one FoxTeller release-table row into a release packet.

    ``row`` must contain exactly four ``<td>`` cells (title, chapter,
    translator/author, release time). Returns ``None`` when the row has a
    different cell count (the row markup is logged) or when the title or
    author text is empty; otherwise returns the packet produced by
    ``msgpackers.createReleasePacket``.
    """
    cells = row.find_all("td")
    if len(cells) != 4:
        self.log.warning("Row does not have four <td> tags! Don't know how to handle")
        for markup_line in row.prettify().split("\n"):
            self.log.warning(markup_line)
        return None

    title_td, ch_td, trans_td, release_td = cells

    title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)
    author = trans_td.get_text(strip=True)
    if not title:
        return None
    if not author:
        return None

    # Cripes this is probably brittle
    if "," in author:
        series_type = "translated"
    else:
        series_type = "oel"

    reldate = float(release_td.span['data-timestamp'])
    chp_title = ch_td.get_text(strip=True)
    vol, chp, frag, _ = extractTitle(chp_title)

    raw_item = {
        'srcname': 'FoxTeller',
        'published': reldate,
        'linkUrl': urllib.parse.urljoin("https://www.foxteller.com", ch_td.a['href']),
    }

    raw_msg = msgpackers._buildReleaseMessage(
        raw_item=raw_item,
        series=title,
        vol=vol,
        chap=chp,
        frag=frag,
        postfix=chp_title,
        tl_type=series_type,
    )
    return msgpackers.createReleasePacket(raw_msg)
def dispatchBT(self, itemurl, itemtxt):
    """Build a release packet for a Baka-Tsuki link.

    Args:
        itemurl: URL of the linked page; also queued via ``put_page_link``.
        itemtxt: Link text, expected to look like "<title> ... by <author>".

    Returns:
        The packet produced by ``msgpackers.createReleasePacket``. (The
        original built the packet and then dropped it; it is now returned,
        matching ``dispatchNanoDesu``.)
    """
    # Cut the text at the first "by" in any letter case.
    # NOTE(review): this matches "by" anywhere, including inside words.
    titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]

    # Whatever precedes "volume"/"chapter" is taken as the series name.
    probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()

    vol, chp, frag, post = extractTitle(titleonly)

    raw_item = {
        'srcname': "Baka-Tsuki",
        'published': time.time(),
        'linkUrl': itemurl,
    }

    self.put_page_link(itemurl)

    msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
    msg = msgpackers.createReleasePacket(msg)
    return msg
def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
    """Build a release packet for a Nano Desu link.

    The series title is looked up in ``NANO_DESU_MAP`` by ``netloc``.
    Returns ``None`` when neither a volume nor a chapter number can be
    extracted from ``itemtxt``; otherwise queues ``itemurl`` via
    ``put_page_link`` and returns the release packet.
    """
    series_title = NANO_DESU_MAP[netloc]
    vol, chp, frag, post = extractTitle(itemtxt)

    if not vol and not chp:
        return None

    raw_item = {
        'srcname': "Nano Desu",
        'published': time.time(),
        'linkUrl': itemurl,
    }

    self.put_page_link(itemurl)

    release = msgpackers.buildReleaseMessage(raw_item, series_title, vol, chp, frag, postfix=post)
    return msgpackers.createReleasePacket(release)
def sendReleases(self, releases):
    """Wrap each release in a release packet and queue it with ``amqp_put_item``.

    Logs the number of releases first. Returns ``None``.
    """
    release_count = len(releases)
    self.log.info(
        "Total releases found on page: %s. Emitting messages into AMQP local queue.",
        release_count)

    for item in releases:
        self.amqp_put_item(msgpackers.createReleasePacket(item, beta=IS_BETA))
def dispatchBT(self, itemurl, itemtxt):
    """Build a release packet for a Baka-Tsuki link.

    Args:
        itemurl: URL of the linked page; also queued via ``put_page_link``.
        itemtxt: Link text, expected to look like "<title> ... by <author>".

    Returns:
        The packet produced by ``msgpackers.createReleasePacket``. (The
        original built the packet and then dropped it; it is now returned,
        matching ``dispatchNanoDesu``.)
    """
    # Cut the text at the first "by" in any letter case.
    # NOTE(review): this matches "by" anywhere, including inside words.
    titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split("BY")[0]

    # Whatever precedes "volume"/"chapter" is taken as the series name.
    probSeries = titleonly.lower().split("volume")[0].split("chapter")[0].strip()

    vol, chp, frag, post = extractTitle(titleonly)

    raw_item = {
        'srcname': "Baka-Tsuki",
        'published': time.time(),
        'linkUrl': itemurl,
    }

    self.put_page_link(itemurl)

    msg = msgpackers.buildReleaseMessage(raw_item, probSeries, vol, chp, frag, postfix=post)
    msg = msgpackers.createReleasePacket(msg)
    return msg
def extractSeries(self, seriesPageUrl, soup):
    """Extract LNDB series metadata from a page and queue it as a series-info packet.

    The item data comes from ``extractSeriesInfo`` applied to the soup
    returned by ``getSoupForSeriesItem``. Genres are normalised into
    de-duplicated lowercase tags. Returns ``None``.
    """
    itemdata = self.extractSeriesInfo(self.getSoupForSeriesItem(seriesPageUrl, soup))

    tags = []
    if 'genre' in itemdata and itemdata['genre']:
        unique_tags = {
            genre.lower().strip().replace(" ", " ").replace(" ", "-")
            for genre in itemdata['genre']
        }
        tags = list(unique_tags)

    # Publication date is sent as a UTC epoch timestamp, or None when unknown.
    pubdate = None
    if itemdata['pubdate']:
        pubdate = calendar.timegm(itemdata['pubdate'].timetuple())

    seriesmeta = {
        'title': itemdata['title'],
        'alt_titles': [itemdata['jTitle']] + itemdata['alt_names'],
        'author': itemdata['author'],
        'illust': itemdata['illust'],
        'desc': itemdata['description'],
        'pubdate': pubdate,
        'pubnames': itemdata['pubnames'],
        'tags': tags,
        'homepage': None,
        'tl_type': 'translated',
        'sourcesite': 'LNDB',
    }

    self.amqp_put_item(
        msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True))
def extractSeries(self, seriesPageUrl, soup):
    """Extract LNDB series metadata from a page and queue it as a series-info packet.

    The item data comes from ``extractSeriesInfo`` applied to the soup
    returned by ``getSoupForSeriesItem``. Genres are normalised into
    de-duplicated lowercase tags. Returns ``None``.
    """
    itemdata = self.extractSeriesInfo(self.getSoupForSeriesItem(seriesPageUrl, soup))

    tags = []
    if 'genre' in itemdata and itemdata['genre']:
        unique_tags = {
            genre.lower().strip().replace(" ", " ").replace(" ", "-")
            for genre in itemdata['genre']
        }
        tags = list(unique_tags)

    # Publication date is sent as a UTC epoch timestamp, or None when unknown.
    pubdate = None
    if itemdata['pubdate']:
        pubdate = calendar.timegm(itemdata['pubdate'].timetuple())

    seriesmeta = {
        'title': itemdata['title'],
        'alt_titles': [itemdata['jTitle']] + itemdata['alt_names'],
        'author': itemdata['author'],
        'illust': itemdata['illust'],
        'desc': itemdata['description'],
        'pubdate': pubdate,
        'pubnames': itemdata['pubnames'],
        'tags': tags,
        'homepage': None,
        'tl_type': 'translated',
        'sourcesite': 'LNDB',
    }

    self.amqp_put_item(
        msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True))
def extractSeries(self, seriesPageUrl, soup):
    """Extract LNDB series metadata from a page and queue it as a series-info packet.

    The item data comes from ``extractSeriesInfo`` applied to the soup
    returned by ``getSoupForSeriesItem``. Genres are normalised into
    de-duplicated lowercase tags. Returns ``None``.
    """
    itemdata = self.extractSeriesInfo(self.getSoupForSeriesItem(seriesPageUrl, soup))

    tags = []
    if "genre" in itemdata and itemdata["genre"]:
        unique_tags = {
            genre.lower().strip().replace(" ", " ").replace(" ", "-")
            for genre in itemdata["genre"]
        }
        tags = list(unique_tags)

    # Publication date is sent as a UTC epoch timestamp, or None when unknown.
    pubdate = None
    if itemdata["pubdate"]:
        pubdate = calendar.timegm(itemdata["pubdate"].timetuple())

    seriesmeta = {
        "title": itemdata["title"],
        "alt_titles": [itemdata["jTitle"]] + itemdata["alt_names"],
        "author": itemdata["author"],
        "illust": itemdata["illust"],
        "desc": itemdata["description"],
        "pubdate": pubdate,
        "pubnames": itemdata["pubnames"],
        "tags": tags,
        "homepage": None,
        "tl_type": "translated",
        "sourcesite": "LNDB",
    }

    self.amqp_put_item(
        msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True))
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Scrape a NovelUpdates series page: store its releases and queue series metadata.

    Args:
        seriesPageUrl: URL of the series page; stored as the referrer of
            every release row.
        soup: Parsed page.

    Returns:
        ``[]`` when a required piece of page markup is missing. Otherwise
        ``None``: releases are upserted through ``upsertNuItem`` and
        committed, and the series-info packet is queued with
        ``amqp_put_item`` only when at least three releases were found.
    """
    titletg = soup.find("h4", class_='seriestitle')
    if not titletg:
        titletg = soup.find("div", class_='seriestitlenu')
    altnametg = soup.find("div", id='editassociated')
    descrtg = soup.find("div", id='editdescription')

    # Containers whose values are the texts of their <a> children.
    link_sets = {
        'authortg'    : soup.find("div", id='showauthors'),
        'artisttg'    : soup.find("div", id='showartists'),
        'langtg'      : soup.find("div", id='showlang'),
        'genretg'     : soup.find("div", id='seriesgenre'),
        'tagstg'      : soup.find("div", id='showtags'),
        'typetg'      : soup.find("div", id='showtype'),
        'orig_pub_tg' : soup.find("div", id='showopublisher'),
        'eng_pub_tg'  : soup.find("div", id='showepublisher'),
    }

    # Containers whose values are their direct text nodes.
    text_sets = {
        'transcompletetg' : soup.find("div", id='showtranslated'),
        'yeartg'          : soup.find("div", id='edityear'),
        'coostatustg'     : soup.find("div", id='editstatus'),
        'licensedtg'      : soup.find("div", id='showlicensed'),
    }

    if not titletg:
        self.log.warning("Could not find item title!")
        return []
    if not altnametg:
        self.log.warning("Could not find alt-name container tag!")
        return []
    if not descrtg:
        self.log.warning("Could not find description container tag!")
        return []

    data_sets = {}
    for key, container in link_sets.items():
        if not container:
            self.log.warning("Could not find tag for name: '%s'", key)
            return []
        data_sets[key] = [tag.get_text() for tag in container.find_all("a")]

    for key, container in text_sets.items():
        if not container:
            self.log.warning("Could not find tag for name: '%s'", key)
            return []
        data_sets[key] = [tmp.strip() for tmp in container.contents if isinstance(tmp, bs4.NavigableString)]

    title = titletg.get_text().strip()

    data_sets['title'] = title
    data_sets['altnames'] = [tmp.strip() for tmp in altnametg.contents if isinstance(tmp, bs4.NavigableString)]

    def scrub(value):
        # Strip every tag, attribute and comment from a scraped string.
        return bleach.clean(value, tags=[], attributes=[], styles=[], strip=True, strip_comments=True).strip()

    # Scrub incoming markup
    for key in list(data_sets):
        if isinstance(data_sets[key], list):
            data_sets[key] = [scrub(val) for val in data_sets[key]]
        else:
            data_sets[key] = scrub(data_sets[key])

    # The year becomes a UTC epoch timestamp for January 1st of that year.
    if data_sets['yeartg'] and data_sets['yeartg'][0]:
        tmp_d = datetime.datetime(year=int(data_sets['yeartg'].pop()), month=1, day=1)
        data_sets['yeartg'] = calendar.timegm(tmp_d.timetuple())
    else:
        data_sets['yeartg'] = None

    # At this point data_sets looks roughly like:
    # {
    #     'title':           'Sendai Yuusha wa Inkyou Shitai',
    #     'altnames':        ['Sendai Yuusha wa Inkyoshitai', 'The Previous Hero wants to Retire'],
    #     'authortg':        ['Iida K'],
    #     'artisttg':        ['Shimotsuki Eito'],
    #     'langtg':          ['Japanese'],
    #     'genretg':         ['Action', 'Adventure', 'Comedy'],
    #     'tagstg':          ['Elves', 'Heroes', 'Magic'],
    #     'typetg':          ['Web Novel'],
    #     'orig_pub_tg':     ['Media Factory'],
    #     'eng_pub_tg':      [],
    #     'coostatustg':     ['3 Volumes (Ongoing)', '5 Web Volumes (Ongoing)'],
    #     'licensedtg':      ['No'],
    #     'transcompletetg': ['No'],
    #     'yeartg':          <epoch timestamp or None>,
    # }

    # The description keeps a small whitelist of formatting tags.
    data_sets['description'] = bleach.clean(descrtg.prettify(), tags=['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li', 'ol', 'strong', 'ul', 'p'], strip=True).strip()

    series_message = {
        'update_only'   : False,
        'sourcesite'    : "NovelUpdates",
        'title'         : data_sets['title'],
        'alt_titles'    : data_sets['altnames'] + [data_sets['title'], ],

        'desc'          : data_sets['description'],
        'author'        : data_sets['authortg'],
        'illust'        : data_sets['artisttg'],

        'pubdate'       : data_sets['yeartg'],
        'pubnames'      : data_sets['orig_pub_tg'] + data_sets['eng_pub_tg'],
        'tags'          : data_sets['tagstg'],

        # AFAICT, NovelUpdates doesn't have any english items, but wth.
        'tl_type'       : "translated" if 'English' not in data_sets['langtg'] else "oel",

        # New:
        'coostate'      : data_sets['coostatustg'],
        'type'          : data_sets['typetg'],
        'genres'        : data_sets['genretg'],
        'licensed'      : data_sets['licensedtg'],
        'transcomplete' : data_sets['transcompletetg'],

        'create_tags'   : True,
    }

    series_info_packet = msgpackers.createSeriesInfoPacket(series_message, matchAuthor=True, beta=self.is_beta)

    chapter_tbl = soup.find("table", id='myTable')
    if not chapter_tbl:
        self.log.error("No chapter table!")
        return

    valid_releases = 0
    for release in chapter_tbl.find_all("tr"):
        items = release.find_all("td")
        if len(items) != 3:
            # Header rows and malformed rows are ignored.
            continue

        date_tg, group_tg, chp_tg = items

        rel = datetime.datetime.strptime(date_tg.get_text().strip(), '%m/%d/%y')
        if rel.date() == datetime.date.today():
            # The page only gives a date; for today's releases use the current time.
            reldate = datetime.datetime.now()
        else:
            reldate = datetime.datetime.fromtimestamp(calendar.timegm(rel.timetuple()))

        release_info = chp_tg.get_text().strip()
        group_name = msgpackers.fixSmartQuotes(group_tg.get_text().strip())

        upsertNuItem(self.raw_cur,
            {
                'seriesname'       : title,
                'releaseinfo'      : release_info,
                'groupinfo'        : group_name,
                'referrer'         : seriesPageUrl,
                'outbound_wrapper' : chp_tg.a['href'],
                'first_seen'       : reldate,
            })

        valid_releases += 1

    self.log.info("Committing!")
    self.raw_cur.execute("COMMIT;")
    self.log.info("Committed!")

    # Do not add series without 3 chapters.
    if valid_releases < 3:
        self.log.warning("Less then three chapters!")
        return

    self.amqp_put_item(series_info_packet)
    return
def sendReleases(self, releases):
    """Wrap each release in a release packet and queue it with ``amqp_put_item``.

    Logs the number of releases first. Returns ``None``.
    """
    release_count = len(releases)
    self.log.info("Total releases found on page: %s", release_count)

    for item in releases:
        self.amqp_put_item(msgpackers.createReleasePacket(item, beta=IS_BETA))
def extractSeriesReleases(self, seriesPageUrl, metadata, soup):
    """Build release messages for a WattPad story and queue its series metadata.

    Args:
        seriesPageUrl: URL of the story page, used as the series homepage.
        metadata: Story metadata dict. Keys read here: ``title``, ``user``
            (with ``name``), ``description``, ``tags``, ``numParts``,
            ``voteCount``, ``language`` (with ``id``), ``id`` and ``parts``
            (each with ``title``, ``modifyDate``, ``url``).
        soup: Parsed story page, used only to recover the category tags.

    Returns:
        A list of release messages, one per part, numbered sequentially
        from 1. An empty list is returned when the story is filtered out
        (too few parts or votes, non-English, blocked ID, masked tag,
        missing required tag) or has no parts. The series-info packet is
        queued only when releases are returned.
    """
    title = metadata['title']
    author = metadata['user']['name']
    raw_desc = metadata['description']
    # Copy, so the page-derived tags appended below do not leak into the
    # caller's metadata dict.
    tags = list(metadata['tags'])

    title = title.strip()

    # Siiiiiigh. Really?
    for contest_marker in ("[#wattys2015]", "(Wattys2015) ", "#Wattys2015", "Wattys2015"):
        title = title.replace(contest_marker, "")
    title = title.strip()

    if metadata['numParts'] < 3:
        return []
    if metadata['voteCount'] < 100:
        return []

    # Language ID 1 is english.
    if metadata['language']['id'] != 1:
        return []

    # Allow blocking of item by ID
    if metadata['id'] in BLOCK_IDS:
        return []

    # For some reason the item category tag is not included in the
    # metadata, so it is parsed out of the page manually.
    tagdiv = soup.find("div", class_="tags")
    if tagdiv:
        for tag in tagdiv.find_all("a", class_='tag'):
            tags.append(tag.get_text())

    tags = list({item.lower().strip().replace(" ", " ").replace(" ", "-") for item in tags})

    # Mask any content with any of the blocked tags.
    if any(item in tags for item in WATTPAD_MASKED_TAGS):
        self.log.warning("Item has a masked tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    # And check that at least one of the target tags is present.
    if not any(item in tags for item in WATTPAD_REQUIRED_TAGS):
        self.log.warning("Item missing required tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    extra = {
        'tags': tags[:],
        'homepage': seriesPageUrl,
        'sourcesite': 'WattPad',
    }

    retval = []
    # Parts carry no reliable numbering, so chapters are numbered by position.
    for index, release in enumerate(metadata['parts'], start=1):
        chp_title = release['title']

        dt = datetime.datetime.strptime(release['modifyDate'], "%Y-%m-%dT%H:%M:%SZ")
        reldate = calendar.timegm(dt.timetuple())

        raw_item = {
            'srcname': "WattPad",
            'published': reldate,
            'linkUrl': release['url'],
        }
        msg = msgpackers._buildReleaseMessage(raw_item, title, None, index, None, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
        retval.append(msg)

    # Don't send the series metadata if we didn't find any chapters.
    if not retval:
        self.log.warning("No chapters!")
        return []

    # Apparently the description is rendered in a <pre> tag, so convert the
    # markdown here. Done only for stories that passed every filter above.
    desc = markdown.markdown(raw_desc, extensions=["mdx_linkify"])

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': desc,
        'tl_type': 'oel',
        'sourcesite': 'WattPad',
    }

    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)
    self.log.info("Wattpad scraper generated %s amqp messages!", len(retval) + 1)
    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release messages for a Booksie novel and queue its series metadata.

    Args:
        seriesPageUrl: URL of the series page, used as the series homepage.
        soup: Parsed series page.

    Returns:
        A list of release messages, one per chapter link. An empty list is
        returned when the item is not a novel, lacks a title or author,
        fails the required/masked tag filters, or has no chapter links.
        The series-info packet is queued only when releases are returned.

    Raises:
        ValueError: If no title container can be found on the page.
    """
    # Yeah, the title text is in a div with an id of "titlePic".
    # The actual image is in a div with the /class/ titlePic
    # wat.
    titlecontainer = soup.find("div", id='titlePic')
    if not titlecontainer:
        titlecontainer = soup.find("div", id='title')
    if not titlecontainer:
        # Exceptions do not apply logging-style lazy formatting; format here.
        raise ValueError("No title at URL: '%s'" % (seriesPageUrl, ))

    titletg = titlecontainer.h1
    typetg, authortg, categorytg = titlecontainer.find_all("a")

    if "novel" not in typetg.get_text().lower():
        return []
    if not titletg:
        return []
    if not authortg:
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    genre = categorytg.get_text()

    # Drop links from the summary, then keep its non-empty text fragments.
    descDiv = soup.find('p', class_='summary')
    for item in descDiv.find_all("a"):
        item.decompose()
    desc = [item.strip() for item in descDiv.find_all(text=True) if item.strip()]

    tags = []
    tagdiv = soup.find("div", id='cloudMain')
    # Skip if no tags
    if tagdiv:
        tags = [item.get_text().strip().lower() for item in tagdiv.find_all("a")]

    tags.append(genre.lower())

    # Fix up a common mis-tagging: "science" and "fiction" entered as
    # separate tags.
    if 'science' in tags and 'fiction' in tags:
        tags.append("science-fiction")
    tags = [tag for tag in tags if tag not in BAD_TAGS]
    tags = [tag for tag in tags if len(tag) > 2]
    tags = [tag.replace(" ", " ").replace(" ", "-") for tag in tags]
    tags = list(set(tags))

    if not any(tag in BOOKSIE_REQUIRED_TAGS for tag in tags):
        self.log.info("Missing required tags!")
        return []
    if any(tag in BOOKSIE_MASKED_TAGS for tag in tags):
        self.log.info("Masked tag!")
        return []

    # Wrap the paragraphs in p tags.
    desc = ['<p>{text}</p>'.format(text=para) for para in desc]

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': "\n\n ".join([str(para) for para in desc]),
        'tl_type': 'oel',
        'sourcesite': 'Booksie',
    }
    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'Booksie',
    }

    # Decompose the announcement (?) div that's cluttering up the
    # search for the chapterdiv. It is not always present.
    badchp = soup.find("div", class_='chapters', id='noticeMessage')
    if badchp:
        badchp.decompose()

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('a')

    retval = []
    for release in releases:
        # The link text is the chapter number.
        chp = int(release.get_text())

        # No post time is available, unfortunately.
        # Force releases to the beginning of time until we catch up.
        reldate = 0

        raw_item = {
            'srcname': "Booksie",
            'published': reldate,
            'linkUrl': release['href'],
        }
        msg = msgpackers.buildReleaseMessage(raw_item, title, None, chp, None, author=author, tl_type='oel', extraData=extra, matchAuthor=True)
        retval.append(msg)

    if not retval:
        print("No releases?")
        return []

    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release packets for a RoyalRoadL fiction and queue its series metadata.

    Args:
        seriesPageUrl: URL of the fiction page; ``self.match_re`` extracts
            the series ID from it.
        soup: Parsed fiction page.

    Returns:
        A list of release packets, one per chapter row. An empty list is
        returned when required markup is missing, the rating is below
        ``MIN_RATING`` (a rating of exactly 0.0 is let through), or fewer
        than three chapters were found. The series-info packet is queued
        only when releases are returned.
    """
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)
    conf = load_lut()

    assert 'force_sequential_numbering' in conf
    must_renumber = series_id in conf['force_sequential_numbering']

    # Validate each tag before it is dereferenced, so that missing markup
    # produces a logged empty result instead of an AttributeError.
    header = soup.find("div", class_='fic-title')
    if not header:
        self.log.error("Could not find title tag!")
        return []
    titletg = header.find("h2")
    authortg = header.find("h4")
    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    # Remove the nested <span> from the author heading before reading the name.
    author_prefix = authortg.find("span")
    if author_prefix:
        author_prefix.decompose()

    # Two page layouts exist for the rating widget.
    ratingtg_type_1 = soup.find("div", class_='rating')
    ratingtg_type_2 = soup.find("li", text=re.compile('Overall Score'))

    if ratingtg_type_1:
        startg = ratingtg_type_1.find("span", class_='star')
    elif ratingtg_type_2:
        starcontainer = ratingtg_type_2.find_next_sibling("li")
        if not starcontainer:
            self.log.error("Could not find rating tag (starcontainer)!")
            return []
        startg = starcontainer.find("span", class_='star')
    else:
        self.log.error("Could not find rating tag!")
        return []

    if not startg:
        self.log.error("Could not find rating tag (startg)!")
        return []

    # The rating is encoded in a CSS class of the form "star-NN".
    ratingcls = [tmp for tmp in startg['class'] if re.match(r"star\-\d+", tmp)]
    if not ratingcls:
        return []
    rating = float(ratingcls[0].split("-")[-1]) / 10
    rating = rating * 2  # Normalize to 1-10 scale

    if not rating >= MIN_RATING and rating != 0.0:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()

    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='description')
    if not descDiv or not descDiv.div:
        self.log.error("Incomplete or broken description?")
        return []

    desc = []
    for segment in descDiv.div:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        elif segment.get_text().strip():
            desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

    def normalize_tag(tag_node):
        # Lowercase, hyphenate, then apply the configured rename table.
        tagtxt = tag_node.get_text().strip().lower().replace(" ", "-")
        if tagtxt in conf['tag_rename']:
            tagtxt = conf['tag_rename'][tagtxt]
        return tagtxt

    tags = []
    tagdiv = soup.find('div', class_='tags')
    for tag in tagdiv.find_all('span', class_='label'):
        tags.append(normalize_tag(tag))

    # Content warnings are treated as additional tags.
    info_div = soup.find("div", class_='fiction-info')
    warning_div = info_div.find("div", class_='font-red-sunglo')
    if warning_div:
        for warning_tag in warning_div.find_all('li'):
            tags.append(normalize_tag(warning_tag))

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': "\r\n".join(desc),
        'tl_type': 'oel',
        'sourcesite': 'RoyalRoadL',
        'create_tags': True,
    }
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    raw_retval = []
    for chapter in soup.find_all("tr", attrs={"data-url" : True}):
        cells = chapter.find_all("td")
        if len(cells) != 2:
            self.log.warning("Row with invalid number of entries?")
            continue
        cname, cdate = cells

        reldate = cdate.time['unixtime']
        relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)

        chp_title = cname.get_text().strip()
        vol, chp, frag, post = extractTitle(chp_title + " " + title)

        raw_item = {
            'srcname': "RoyalRoadL",
            'published': float(reldate),
            'linkUrl': relurl,
        }
        raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
        raw_retval.append(raw_msg)

    missing_chap = sum(1 for item in raw_retval if not (item['vol'] or item['chp']))

    if raw_retval:
        # Percentage of chapters with neither a volume nor a chapter number.
        unnumbered = (missing_chap / len(raw_retval)) * 100
        if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
            if must_renumber:
                self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
            else:
                self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
            for chap, item in enumerate(raw_retval, start=1):
                item['vol'] = None
                item['chp'] = chap

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []

    self.amqp_put_item(meta_pkt)
    return [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release packets for a RoyalRoadL fiction (rating read from <meta> tags).

    Args:
        seriesPageUrl: URL of the fiction page; ``self.match_re`` extracts
            the series ID from it.
        soup: Parsed fiction page.

    Returns:
        A list of release packets, one per chapter row. An empty list is
        returned when required markup is missing, the rating is below
        ``MIN_RATING`` (a rating of exactly 0.0 is let through), or fewer
        than three chapters were found. The series-info packet is built
        but, in this version, not queued.
    """
    match = self.match_re.search(seriesPageUrl)
    series_id = match.group(1)
    conf = load_lut()

    assert 'force_sequential_numbering' in conf
    must_renumber = series_id in conf['force_sequential_numbering']

    # Validate each tag before it is dereferenced, so that missing markup
    # produces a logged empty result instead of an AttributeError.
    header = soup.find("div", class_='fic-title')
    if not header:
        self.log.error("Could not find title tag!")
        return []
    titletg = header.find("h1")
    authortg = header.find("h4")
    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    # Remove the nested <span> from the author heading before reading the name.
    author_prefix = authortg.find("span")
    if author_prefix:
        author_prefix.decompose()

    rating_val = soup.find("meta", property='books:rating:value')
    rating_scale = soup.find("meta", property='books:rating:scale')

    self.log.debug("Rating value: %s", rating_val)
    self.log.debug("Rating scale: %s", rating_scale)

    if not rating_val or not rating_scale:
        return []

    rval_f = float(rating_val.get('content', "0"))
    rscale_f = float(rating_scale.get('content', "999999"))
    if not rscale_f:
        # A zero scale cannot be normalised.
        return []

    # Normalise onto a 0-5 scale.
    rating = 5 * (rval_f / rscale_f)
    self.log.debug("Float rating: %s", rating)

    if not rating >= MIN_RATING and rating != 0.0:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()

    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='description')
    if not descDiv or not descDiv.div:
        self.log.error("Incomplete or broken description?")
        return []

    desc = []
    for segment in descDiv.div:
        if isinstance(segment, bs4.NavigableString):
            desc.append(str(segment).strip())
        elif segment.get_text().strip():
            desc.append(segment.get_text().strip())
    desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

    def normalize_tag(tag_node):
        # Lowercase, hyphenate, then apply the configured rename table.
        tagtxt = tag_node.get_text().strip().lower().replace(" ", "-")
        if tagtxt in conf['tag_rename']:
            tagtxt = conf['tag_rename'][tagtxt]
        return tagtxt

    tags = []
    tagdiv = soup.find('span', class_='tags')
    for tag in tagdiv.find_all('span', class_='label'):
        tags.append(normalize_tag(tag))

    # Content warnings are treated as additional tags.
    info_div = soup.find("div", class_='fiction-info')
    warning_div = info_div.find("div", class_='font-red-sunglo')
    if warning_div:
        for warning_tag in warning_div.find_all('li'):
            tags.append(normalize_tag(warning_tag))

    seriesmeta = {
        'title': msgpackers.fix_string(title),
        'author': msgpackers.fix_string(author),
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': "\r\n".join(desc),
        'tl_type': 'oel',
        'sourcesite': 'RoyalRoadL',
        'create_tags': True,
    }
    # NOTE(review): the packet is still built, but queueing it is disabled
    # below — confirm whether that is intentional.
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    raw_retval = []
    for chapter in soup.find_all("tr", attrs={"data-url": True}):
        cells = chapter.find_all("td")
        if len(cells) != 2:
            self.log.warning("Row with invalid number of entries?")
            continue
        cname, cdate = cells

        reldate = cdate.time['unixtime']
        relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'], seriesPageUrl)

        chp_title = cname.get_text().strip()
        vol, chp, frag, post = extractTitle(chp_title + " " + title)

        raw_item = {
            'srcname': "RoyalRoadL",
            'published': float(reldate),
            'linkUrl': relurl,
        }
        raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag, author=author, postfix=chp_title, tl_type='oel', extraData=extra, matchAuthor=True)
        raw_retval.append(raw_msg)

    missing_chap = sum(1 for item in raw_retval if not (item['vol'] or item['chp']))

    if raw_retval:
        # Percentage of chapters with neither a volume nor a chapter number.
        unnumbered = (missing_chap / len(raw_retval)) * 100
        if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
            if must_renumber:
                self.log.warning("Item numbering force-overridden! Adding simple sequential chapter numbers.")
            else:
                self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
            for chap, item in enumerate(raw_retval, start=1):
                item['vol'] = None
                item['chp'] = chap

    # Do not add series without 3 chapters.
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []

    # self.amqp_put_item(meta_pkt)
    return [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release messages for the chapters listed on a RoyalRoadL series page.

    Args:
        seriesPageUrl: URL of the series page; stored as the series homepage.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        A list of messages produced by ``msgpackers.buildReleaseMessage``, or
        ``[]`` when the rating, title or author tag is missing, the rating is
        below ``MIN_RATING``, or fewer than three chapters were found.

    Raises:
        AssertionError: if the author text does not start with ``"by "``.
    """
    titletg = soup.find("h1", class_='fiction-title')
    authortg = soup.find("span", class_='author')
    ratingtg = soup.find("span", class_='overall')

    if not ratingtg:
        return []
    if not float(ratingtg['score']) >= MIN_RATING:
        return []
    if not titletg:
        return []
    if not authortg:
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    # Raised explicitly so the check is not stripped when running under -O.
    if not author.startswith("by "):
        raise AssertionError("Unexpected author text: %r" % (author,))
    author = author[2:].strip()

    # A page without a description block simply yields no tags/description.
    descDiv = soup.find('div', class_='description')
    paras = descDiv.find_all("p") if descDiv else []

    tags = []
    desc = []
    for para in paras:
        text = para.get_text()
        if text.lower().startswith('categories:'):
            # "Categories: a, b, c" paragraphs carry the tag list.
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': " ".join(str(para) for para in desc),
        'tl_type': 'oel',
        'sourcesite': 'RoyalRoadL',
    }
    pkt = msgpackers.sendSeriesInfoPacket(seriesmeta)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('li', class_='chapter')

    retval = []
    for release in releases:
        chp_title, reldatestr = release.find_all("span")
        rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
        # The page only gives a date; for today's releases use the current
        # time instead of midnight.
        if rel.date() == datetime.date.today():
            reldate = time.time()
        else:
            reldate = calendar.timegm(rel.timetuple())

        chp_title = chp_title.get_text()
        vol, chp, frag, _ = extractTitle(chp_title)

        raw_item = {
            'srcname': "RoyalRoadL",
            'published': reldate,
            'linkUrl': release.a['href'],
        }
        msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                             author=author, postfix=chp_title,
                                             tl_type='oel', extraData=extra)
        retval.append(msg)

    # Do not add series without 3 chapters (this also covers the empty case).
    if len(retval) < 3:
        return []

    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release packets for the chapters listed on a RoyalRoadL series page.

    Args:
        seriesPageUrl: URL of the series page; stored as the series homepage.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        A list of packets from ``msgpackers.createReleasePacket`` (one per
        chapter), or ``[]`` when a required tag is missing, the rating is
        non-zero and below ``MIN_RATING``, or fewer than three chapters were
        found. The series-info packet is queued via ``self.amqp_put_item``
        only when chapters are returned.

    Raises:
        AssertionError: if the author text does not start with ``"by "``.
    """
    titletg = soup.find("h1", class_='fiction-title')
    authortg = soup.find("span", class_='author')
    ratingtg = soup.find("span", class_='overall')

    if not ratingtg:
        self.log.info("Could not find rating tag!")
        return []

    rating = float(ratingtg['score'])
    # A rating of exactly 0.0 is let through rather than rejected.
    if not rating >= MIN_RATING and rating != 0.0:
        self.log.info("Item rating below upload threshold: %s", rating)
        return []

    if not titletg:
        self.log.info("Could not find title tag!")
        return []
    if not authortg:
        self.log.info("Could not find author tag!")
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    # Raised explicitly so the check is not stripped when running under -O.
    if not author.startswith("by "):
        raise AssertionError("Unexpected author text: %r" % (author,))
    author = author[2:].strip()

    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    # A page without a description block simply yields no tags/description.
    descDiv = soup.find('div', class_='description')
    paras = descDiv.find_all("p") if descDiv else []

    tags = []
    desc = []
    for para in paras:
        text = para.get_text()
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': " ".join(str(para) for para in desc),
        'tl_type': 'oel',
        'sourcesite': 'RoyalRoadL',
    }
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('li', class_='chapter')

    raw_retval = []
    for release in releases:
        chp_title, reldatestr = release.find_all("span")
        rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
        # The page only gives a date; for today's releases use the current
        # time instead of midnight.
        if rel.date() == datetime.date.today():
            reldate = time.time()
        else:
            reldate = calendar.timegm(rel.timetuple())

        chp_title = chp_title.get_text()
        # The series title is appended to the text handed to the title parser.
        vol, chp, frag, _ = extractTitle(chp_title + " " + title)

        raw_item = {
            'srcname': "RoyalRoadL",
            'published': reldate,
            'linkUrl': release.a['href'],
        }
        raw_msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                 author=author, postfix=chp_title,
                                                 tl_type='oel', extraData=extra,
                                                 matchAuthor=True)
        raw_retval.append(raw_msg)

    # If more than 80% of at least five chapters carry neither a volume nor a
    # chapter number, fall back to plain sequential numbering.
    missing_chap = sum(1 for item in raw_retval if not (item['vol'] or item['chp']))
    if len(raw_retval) >= 5 and (missing_chap / len(raw_retval)) * 100 > 80:
        self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
        for chap, item in enumerate(raw_retval, start=1):
            item['vol'] = None
            item['chp'] = chap

    # Do not add series without 3 chapters (this also covers the empty case).
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []

    self.amqp_put_item(meta_pkt)
    return [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Scrape a NovelUpdates series page: series metadata plus outbound release links.

    Series metadata is collected from the page's info containers, scrubbed
    with ``bleach`` and turned into a series-info packet. Every unmasked
    ``chp-release`` link in the ``myTable`` table is upserted through
    ``upsertNuItem`` (except for the Qidian International / Webnovel groups),
    the cursor is committed, and the series-info packet is queued.

    Args:
        seriesPageUrl: URL of the series page; stored as the upsert referrer.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        ``[]`` when a required metadata container is missing. ``None`` when
        the chapter table is missing and also on normal completion (results
        are delivered through side effects, not the return value).
    """
    titletg = soup.find("h4", class_='seriestitle')
    if not titletg:
        titletg = soup.find("div", class_='seriestitlenu')
    altnametg = soup.find("div", id='editassociated')
    descrtg = soup.find("div", id='editdescription')

    # Containers whose values are the texts of their <a> children.
    link_sets = {
        'authortg': soup.find("div", id='showauthors'),
        'artisttg': soup.find("div", id='showartists'),
        'langtg': soup.find("div", id='showlang'),
        'genretg': soup.find("div", id='seriesgenre'),
        'tagstg': soup.find("div", id='showtags'),
        'typetg': soup.find("div", id='showtype'),
        'orig_pub_tg': soup.find("div", id='showopublisher'),
        'eng_pub_tg': soup.find("div", id='showepublisher'),
    }
    # Containers whose values are their bare text nodes.
    text_sets = {
        'transcompletetg': soup.find("div", id='showtranslated'),
        'yeartg': soup.find("div", id='edityear'),
        'coostatustg': soup.find("div", id='editstatus'),
        'licensedtg': soup.find("div", id='showlicensed'),
    }

    if not titletg:
        self.log.warning("Could not find item title!")
        self.log.warning("On URL: '%s'", seriesPageUrl)
        self.log.warning("%s", soup)
        return []
    if not altnametg:
        self.log.warning("Could not find alt-name container tag!")
        return []
    if not descrtg:
        self.log.warning("Could not find description container tag!")
        return []

    data_sets = {}
    for key, container in link_sets.items():
        if not container:
            self.log.warning("Could not find tag for name: '%s'", key)
            return []
        data_sets[key] = [tag.get_text() for tag in container.find_all("a")]

    for key, container in text_sets.items():
        if not container:
            self.log.warning("Could not find tag for name: '%s'", key)
            return []
        data_sets[key] = [
            tmp.strip() for tmp in container.contents
            if isinstance(tmp, bs4.NavigableString)
        ]

    title = titletg.get_text().strip()
    data_sets['title'] = title
    data_sets['altnames'] = [
        tmp.strip() for tmp in altnametg.contents
        if isinstance(tmp, bs4.NavigableString)
    ]
    data_sets['altnames'] = [
        tmp for tmp in data_sets['altnames'] if tmp.lower() != 'n/a'
    ]

    # Scrub incoming markup
    for key, value in list(data_sets.items()):
        if isinstance(value, list):
            data_sets[key] = [
                bleach.clean(val, tags=[], attributes=[], styles=[],
                             strip=True, strip_comments=True).strip()
                for val in value
            ]
        else:
            data_sets[key] = bleach.clean(value, tags=[], attributes=[], styles=[],
                                          strip=True, strip_comments=True).strip()

    # Convert the year ("2013" or "2013-...") into a UTC timestamp for Jan 1st
    # of that year; anything unparseable becomes None.
    if data_sets['yeartg'] and data_sets['yeartg'][0]:
        try:
            yearstr = data_sets['yeartg'].pop().split("-")[0]
            tmp_d = datetime.datetime(year=int(yearstr), month=1, day=1)
            data_sets['yeartg'] = calendar.timegm(tmp_d.timetuple())
        except ValueError:
            data_sets['yeartg'] = None
    else:
        data_sets['yeartg'] = None

    # Example of data_sets at this point (description added just below):
    # {
    #     'coostatustg': ['3 Volumes (Ongoing)', '5 Web Volumes (Ongoing)'],
    #     'orig_pub_tg': ['Media Factory'],
    #     'eng_pub_tg': [],
    #     'typetg': ['Web Novel'],
    #     'genretg': ['Action', 'Adventure', 'Comedy', ...],
    #     'licensedtg': ['No'],
    #     'altnames': ['Sendai Yuusha wa Inkyoshitai', 'The Previous Hero wants to Retire', ...],
    #     'authortg': ['Iida K'],
    #     'artisttg': ['Shimotsuki Eito'],
    #     'title': 'Sendai Yuusha wa Inkyou Shitai',
    #     'description': '<p>\n Three years ago, in the land of Reinbulk, ...\n </p>',
    #     'tagstg': ['Elves', 'Heroes', 'Magic', ...],
    #     'langtg': ['Japanese'],
    #     'yeartg': ['2013'],
    #     'transcompletetg': ['No'],
    # }

    data_sets['description'] = bleach.clean(
        descrtg.prettify(),
        tags=[
            'a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i',
            'li', 'ol', 'strong', 'ul', 'p'
        ],
        strip=True).strip()

    series_message = {
        'update_only': False,
        'sourcesite': "NovelUpdates",
        'title': data_sets['title'],
        'alt_titles': data_sets['altnames'] + [data_sets['title']],
        'desc': data_sets['description'],
        'author': data_sets['authortg'],
        'illust': data_sets['artisttg'],
        'pubdate': data_sets['yeartg'],
        'pubnames': data_sets['orig_pub_tg'] + data_sets['eng_pub_tg'],
        'tags': data_sets['tagstg'],
        # AFAICT, NovelUpdates doesn't have any english items, but wth.
        'tl_type': "translated" if 'English' not in data_sets['langtg'] else "oel",
        # New:
        'coostate': "<br />".join(data_sets['coostatustg']),
        'type': data_sets['typetg'],
        'genres': data_sets['genretg'],
        'licensed': "<br />".join(data_sets['licensedtg']),
        'transcomplete': "<br />".join(data_sets['transcompletetg']),
        'create_tags': True,
    }

    series_info_packet = msgpackers.createSeriesInfoPacket(
        series_message, matchAuthor=True, beta=self.is_beta)

    chapter_tbl = soup.find("table", id='myTable')
    if not chapter_tbl:
        self.log.error("No chapter table!")
        return

    releases = chapter_tbl.find_all("tr")
    masked_classes = self.getMaskedClasses(soup)

    valid_releases = 0
    for release in releases:
        items = release.find_all("td")
        if len(items) != 3:
            continue
        date_tg, group_tg, chp_tg = items

        rel = datetime.datetime.strptime(date_tg.get_text().strip(), '%m/%d/%y')
        # The table only gives a date; for today's releases use the current
        # time instead of midnight.
        if rel.date() == datetime.date.today():
            reldate = datetime.datetime.now()
        else:
            reldate = datetime.datetime.fromtimestamp(
                calendar.timegm(rel.timetuple()))
        self.log.debug("Release date: %s", reldate)

        release_info = chp_tg.get_text().strip()
        group_name = group_tg.get_text().strip()
        group_name = msgpackers.fixSmartQuotes(group_name)

        for link in release.find_all('a', class_='chp-release'):
            # Skip links carrying any of the page's masked CSS classes.
            if any(tmp in masked_classes for tmp in link['class']):
                continue

            linkfq = link['href']
            if linkfq.startswith("//"):
                linkfq = "https:" + linkfq
            if "http://" in linkfq:
                # Keep only the part of the link before an embedded http:// URL.
                linkfq = linkfq.split("http://")[0]

            if group_name == 'Qidian International':
                self.log.info("Qidian item. Skipping.")
            elif group_name == 'Webnovel':
                self.log.info("Qidian item. Skipping.")
            else:
                changed = upsertNuItem(
                    self.raw_cur, {
                        'seriesname': title,
                        'releaseinfo': release_info,
                        'groupinfo': group_name,
                        'referrer': seriesPageUrl,
                        'outbound_wrapper': linkfq,
                        'release_date': reldate,
                        'first_seen': datetime.datetime.now(),
                    })
                self.log.info(
                    "Upserting outbound wrapper url %s, changed %s rows.",
                    linkfq, changed)
                if changed:
                    self.mon_con.incr('new-urls', 1)
            valid_releases += 1

    self.log.info("Found %s releases on page!", valid_releases)
    self.log.info("Committing!")
    self.raw_cur.execute("COMMIT;")
    self.log.info("Committed!")

    # The three-chapter minimum used by other scrapers is not enforced here
    # (the check is disabled).
    self.amqp_put_item(series_info_packet)
    return
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release packets for every chapter on a JapTem series page.

    Args:
        seriesPageUrl: URL of the series page; chapter links are resolved
            against it.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        A list of release packets followed by the series-info packet, or
        ``[]`` when the page is missing required elements, the rating is
        below ``MIN_RATING``, fewer than three chapters are advertised, or
        no chapter entries were found.
    """
    titletg = soup.find("div", class_='fanfic_title_div')
    authortg = soup.find("div", class_='fanfic_author_div')
    wrapper = soup.find("div", class_='fanfic_title_wrapper')
    if titletg is None or authortg is None or wrapper is None:
        return []

    title = titletg.get_text()
    author = authortg.get_text()

    # The stats line looks like "<rating> · <views> · <chapters>".
    rating_lines = [item for item in wrapper.contents if "Rating" in str(item)]
    if not rating_lines:
        # Without the stats line neither rating nor chapter count is known.
        return []
    ratingtg = rating_lines.pop()

    parts = ratingtg.split("·")
    if len(parts) != 3:
        return []
    rating, _views, chapters = parts

    # I think the japtem rating system is just plain out broken.
    if "no rating" not in ratingtg.lower():
        rating_score = float(rating.split()[-1])
        if not rating_score >= MIN_RATING:
            return []

    chapter_num = float(chapters.split()[0])
    if chapter_num < 3:
        return []

    if not title:
        return []
    if not author:
        return []

    descDiv = soup.find('div', class_='fanfic_synopsis')
    if not descDiv:
        # Dump the page for diagnosis, then give up instead of crashing on
        # the missing synopsis container.
        print(soup)
        return []
    paras = descDiv.find_all("p")

    tags = []
    desc = []
    for para in paras:
        text = para.get_text()
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': '',
        'desc': " ".join(str(para) for para in desc),
        'tl_type': 'oel',
        'sourcesite': 'JapTem',
    }
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': '',
        'sourcesite': 'JapTem',
    }

    retval = []
    chapter_list = soup.find("ul", class_='fanfic_chapter_list')
    for volume in chapter_list.find_all('li', class_='fanfic_volume'):
        for release in volume.find_all('li', class_='fanfic_chapter'):
            vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
            chp_title = release.find("a").get_text()

            # The volume heading is prepended so the title parser can pick up
            # the volume number as well.
            agg_title = " ".join((vol_str, chp_title))
            vol, chp, frag, _ = extractTitle(agg_title)

            raw_item = {
                'srcname': "JapTem",
                # No release date is read from the page; the scrape time is used.
                'published': time.time(),
                'linkUrl': urllib.parse.urljoin(seriesPageUrl, release.a['href']),
            }
            msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                 author=author, postfix=chp_title,
                                                 tl_type='oel', extraData=extra)
            retval.append(msgpackers.createReleasePacket(msg))

    if not retval:
        return []

    retval.append(meta_pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release messages for every chapter link on a Booksie series page.

    Args:
        seriesPageUrl: URL of the series page; stored as the series homepage.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        A list of messages from ``msgpackers.buildReleaseMessage``, or ``[]``
        when the item is not a novel, lacks a title/author, fails the
        required/masked tag filters, or has no chapter links. The series-info
        packet is queued via ``self.amqp_put_item`` only when releases exist.

    Raises:
        ValueError: if no title container is found on the page.
    """
    # Yeah, the title text is in a div with an id of "titlePic".
    # The actual image is in a div with the /class/ titlePic
    # wat.
    titlecontainer = soup.find("div", id="titlePic")
    if not titlecontainer:
        titlecontainer = soup.find("div", id="title")
    if not titlecontainer:
        raise ValueError("No title at URL: '%s'" % (seriesPageUrl,))

    titletg = titlecontainer.h1
    typetg, authortg, categorytg = titlecontainer.find_all("a")

    if "novel" not in typetg.get_text().lower():
        return []
    if not titletg:
        return []
    if not authortg:
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    genre = categorytg.get_text()

    desc = []
    descDiv = soup.find("p", class_="summary")
    if descDiv:
        # Drop links from the summary before collecting its text nodes.
        for item in descDiv.find_all("a"):
            item.decompose()
        desc = [item.strip() for item in descDiv.find_all(text=True) if item.strip()]

    tagdiv = soup.find("div", id="cloudMain")
    tags = []
    # Skip if no tags
    if tagdiv:
        tags = [item.get_text().strip().lower() for item in tagdiv.find_all("a")]

    tags.append(genre.lower())

    # Fix a lot of the stupid tag fuckups I've seen.
    # People are stupid.
    if "science" in tags and "fiction" in tags:
        tags.append("science-fiction")

    tags = [tag for tag in tags if tag not in BAD_TAGS]
    tags = [tag for tag in tags if len(tag) > 2]
    # Collapse whitespace runs and hyphenate multi-word tags.
    tags = ["-".join(tag.split()) for tag in tags]
    tags = list(set(tags))

    if not any(tag in BOOKSIE_REQUIRED_TAGS for tag in tags):
        self.log.info("Missing required tags!")
        return []
    if any(tag in BOOKSIE_MASKED_TAGS for tag in tags):
        self.log.info("Masked tag!")
        return []

    # Wrap the paragraphs in p tags.
    desc = ["<p>{text}</p>".format(text=para) for para in desc]

    seriesmeta = {
        "title": title,
        "author": author,
        "tags": tags,
        "homepage": seriesPageUrl,
        "desc": "\n\n ".join(str(para) for para in desc),
        "tl_type": "oel",
        "sourcesite": "Booksie",
    }
    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)

    extra = {
        "tags": tags,
        "homepage": seriesPageUrl,
        "sourcesite": "Booksie",
    }

    # Decompose the announcement (?) div that's cluttering up the
    # search for the chapterdiv
    badchp = soup.find("div", class_="chapters", id="noticeMessage")
    if badchp:
        badchp.decompose()

    chapters = soup.find("div", class_="chapters")
    releases = chapters.find_all("a")

    retval = []
    for release in releases:
        # The link text is the chapter number.
        chp = int(release.get_text())

        raw_item = {
            "srcname": "Booksie",
            # No post time, unfortunately. Force releases to the beginning
            # of time until we catch up.
            "published": 0,
            "linkUrl": release["href"],
        }
        msg = msgpackers.buildReleaseMessage(
            raw_item, title, None, chp, None,
            author=author, tl_type="oel", extraData=extra, matchAuthor=True,
        )
        retval.append(msg)

    if not retval:
        print("No releases?")
        return []

    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release messages for the chapters listed on a RoyalRoadL series page.

    Args:
        seriesPageUrl: URL of the series page; stored as the series homepage.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        A list of messages from ``msgpackers.buildReleaseMessage`` (one per
        chapter), or ``[]`` when a required tag is missing, the rating is
        non-zero and below ``MIN_RATING``, or fewer than three chapters were
        found. The series-info packet is queued via ``self.amqp_put_item``
        only when chapters are returned.

    Raises:
        AssertionError: if the author text does not start with ``"by "``.
    """
    titletg = soup.find("h1", class_='fiction-title')
    authortg = soup.find("span", class_='author')
    ratingtg = soup.find("span", class_='overall')

    if not ratingtg:
        self.log.info("Could not find rating tag!")
        return []

    rating = float(ratingtg['score'])
    # A rating of exactly 0.0 is let through rather than rejected.
    if not rating >= MIN_RATING and rating != 0.0:
        self.log.info("Item rating below upload threshold: %s", rating)
        return []

    if not titletg:
        self.log.info("Could not find title tag!")
        return []
    if not authortg:
        self.log.info("Could not find author tag!")
        return []

    title = titletg.get_text()
    author = authortg.get_text()
    # Raised explicitly so the check is not stripped when running under -O.
    if not author.startswith("by "):
        raise AssertionError("Unexpected author text: %r" % (author,))
    author = author[2:].strip()

    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    # A page without a description block simply yields no tags/description.
    descDiv = soup.find('div', class_='description')
    paras = descDiv.find_all("p") if descDiv else []

    tags = []
    desc = []
    for para in paras:
        text = para.get_text()
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': " ".join(str(para) for para in desc),
        'tl_type': 'oel',
        'sourcesite': 'RoyalRoadL',
    }
    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    chapters = soup.find("div", class_='chapters')
    releases = chapters.find_all('li', class_='chapter')

    retval = []
    for release in releases:
        chp_title, reldatestr = release.find_all("span")
        rel = datetime.datetime.strptime(reldatestr.get_text(), '%d/%m/%y')
        # The page only gives a date; for today's releases use the current
        # time instead of midnight.
        if rel.date() == datetime.date.today():
            reldate = time.time()
        else:
            reldate = calendar.timegm(rel.timetuple())

        chp_title = chp_title.get_text()
        vol, chp, frag, _ = extractTitle(chp_title)

        raw_item = {
            'srcname': "RoyalRoadL",
            'published': reldate,
            'linkUrl': release.a['href'],
        }
        msg = msgpackers.buildReleaseMessage(raw_item, title, vol, chp, frag,
                                             author=author, postfix=chp_title,
                                             tl_type='oel', extraData=extra,
                                             matchAuthor=True)
        retval.append(msg)

    # If more than 80% of at least five chapters carry neither a volume nor a
    # chapter number, fall back to plain sequential numbering.
    missing_chap = sum(1 for item in retval if not (item['vol'] or item['chp']))
    if len(retval) >= 5 and (missing_chap / len(retval)) * 100 > 80:
        self.log.warning("Item seems to not have numbered chapters. Adding simple sequential chapter numbers.")
        for chap, item in enumerate(retval, start=1):
            item['vol'] = None
            item['chp'] = chap

    # Do not add series without 3 chapters (this also covers the empty case).
    if len(retval) < 3:
        self.log.info("Less then three chapters!")
        return []

    self.amqp_put_item(pkt)
    return retval
def sendReleases(self, releases):
    """Queue one release packet per entry in ``releases``.

    Logs how many releases were found, then wraps each entry with
    ``msgpackers.createReleasePacket`` (passing ``beta=self.is_beta``) and
    hands the resulting packet to ``self.amqp_put_item``.
    """
    release_count = len(releases)
    self.log.info("Total releases found on page: %s. Emitting messages into AMQP local queue.", release_count)

    for entry in releases:
        self.amqp_put_item(msgpackers.createReleasePacket(entry, beta=self.is_beta))
def extractSeriesReleases(self, seriesPageUrl, soup):
    """Build release packets plus a series-info packet for a ScribbleHub series page.

    Args:
        seriesPageUrl: URL of the series page; ``self.match_re`` must capture
            the series id from it as group 1.
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        A list of release packets followed by the series-info packet, or
        ``[]`` when the URL does not match, a required element is missing,
        the rating or rating count is below the ``SeriesPageCommon``
        minimums, or fewer than three chapters were found.
    """
    match = self.match_re.search(seriesPageUrl)
    if match is None:
        self.log.error("Could not extract series ID from URL: '%s'", seriesPageUrl)
        return []
    series_id = match.group(1)

    titletg = soup.find("div", class_='fic_title')
    authortg = soup.find("span", class_='auth_name_fic')

    if not titletg:
        self.log.error("Could not find title tag!")
        return []
    if not authortg:
        self.log.error("Could not find author tag!")
        return []

    # Merge the top-level keys of every ld+json block on the page; later
    # blocks override earlier ones.
    agg_meta = {}
    for meta in soup.find_all("script", type="application/ld+json"):
        loaded = json.loads(meta.get_text())
        if isinstance(loaded, dict):
            agg_meta.update(loaded)

    rating = float(agg_meta.get('ratingValue', "0"))
    rating_cnt = float(agg_meta.get('ratingCount', "0"))

    self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)

    if rating < SeriesPageCommon.MIN_RATING_STARS:
        self.log.error("Item rating below upload threshold: %s", rating)
        return []
    if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
        self.log.error("Item has insufficent ratings: %s", rating_cnt)
        return []

    title = titletg.get_text().strip()
    author = authortg.get_text().strip()

    title = bleach.clean(title, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)
    author = bleach.clean(author, tags=[], attributes=[], styles=[], strip=True, strip_comments=True)

    descDiv = soup.find('div', class_='wi_fic_desc')
    if not descDiv or not descDiv.p:
        self.log.error("Incomplete or broken description?")
        return []

    # Each non-empty direct child of the description becomes one paragraph.
    desc = []
    for segment in descDiv:
        if isinstance(segment, bs4.NavigableString):
            text = str(segment).strip()
        else:
            text = segment.get_text().strip()
        if text:
            desc.append('<p>{}</p>'.format(text))

    tags = []
    tagdiv = soup.find('span', class_='wi_fic_showtags')
    tag_links = tagdiv.find_all('a', class_='stag') if tagdiv else []
    for tag in tag_links:
        tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
        tags.append(SeriesPageCommon.fix_tag(tagtxt))

    # These are separate on SH, but I'm just treating them as tags.
    for tag in soup.find_all('li', class_='mature_contains'):
        tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
        tags.append(SeriesPageCommon.fix_tag(tagtxt))

    genres = []
    genrediv = soup.find('span', class_='wi_fic_genre')
    genre_links = genrediv.find_all('a', class_='fic_genre') if genrediv else []
    for genre in genre_links:
        genretxt = SeriesPageCommon.clean_tag(genre.get_text())
        genres.append(SeriesPageCommon.fix_genre(genretxt))

    seriesmeta = {
        'title': msgpackers.fix_string(title),
        'author': msgpackers.fix_string(author),
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': "\r\n".join(desc),
        'tl_type': 'oel',
        'sourcesite': 'ScribbleHub',
        'create_tags': True,
    }
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'genres': genres,
        'homepage': seriesPageUrl,
        'sourcesite': 'ScribbleHub',
    }

    self.log.info("Found %s tags, %s genres", len(tags), len(genres))

    date_parser = parsedatetime.Calendar()

    raw_retval = []
    for chapter in soup.find_all("li", class_='toc_w'):
        cname, cdate = chapter.a, chapter.span
        if not (cname and cdate):
            self.log.warning("Row with invalid number of entries?")
            continue

        if not cdate.get("title"):
            self.log.error("No time entry?")
            continue

        # The release time is read from the span's title attribute.
        timestr = cdate.get("title").strip()
        itemDate, status = date_parser.parse(timestr)
        if status < 1:
            self.log.warning("Failure processing date: %s", timestr)
            continue

        reldate = time.mktime(itemDate)
        relurl = common.util.urlFuncs.rebaseUrl(cname['href'], seriesPageUrl)

        chp_title = cname.get_text().strip()
        # The series title is appended to the text handed to the title parser.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)

        raw_item = {
            'srcname': "ScribbleHub",
            'published': float(reldate),
            'linkUrl': relurl,
        }
        raw_msg = msgpackers._buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                  author=author, postfix=chp_title,
                                                  tl_type='oel', extraData=extra,
                                                  matchAuthor=True)
        raw_retval.append(raw_msg)

    raw_retval = SeriesPageCommon.check_fix_numbering(self.log, raw_retval, series_id, sh=True)

    # Do not add series without 3 chapters (this also covers the empty case).
    if len(raw_retval) < 3:
        self.log.info("Less then three chapters!")
        return []

    retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
    # Report the chapter count before the metadata packet is appended.
    self.log.info("Found %s chapter releases on series page!", len(retval))
    retval.append(meta_pkt)
    return retval
def process_series(self, series):
    """Fetch a RoyalRoad series and its chapters from the API and queue their packets.

    Args:
        series: Mapping describing the series. It must contain the keys
            'chapters', 'cover', 'description', 'firstUpdate', 'id',
            'lastUpdate', 'tags' and 'title'; only ``series['id']`` is used
            to query the API.

    Returns:
        None. On success the series-info packet and one release packet per
        chapter are queued, and the series/chapter URLs are passed to
        ``self.low_priority_links_trigger``. The method returns early without
        emitting anything when keys are missing, validation fails, the rating
        thresholds are not met, or no author is present.

    Raises:
        AssertionError: if the API returns data for a different series id.
    """
    expected_keys = [
        'chapters', 'cover', 'description', 'firstUpdate', 'id',
        'lastUpdate', 'tags', 'title'
    ]
    missing_keys = [key for key in expected_keys if key not in series]
    if missing_keys:
        self.log.error("Missing key(s) %s from series %s. Cannot continue",
                       missing_keys, series)
        return

    # Example of a `series` entry:
    # {
    #     'topCover': None,
    #     'description': "<p>Gerald, born a Viscount's son, ...</p>",
    #     'id': 19290,
    #     'firstUpdate': datetime.datetime(2018, 7, 10, 6, 35, 48),
    #     'topCoverAlignment': 0,
    #     'chapters': [{'title': 'Chapter 33',
    #                   'fictionId': 19290,
    #                   'date': datetime.datetime(2018, 8, 28, 1, 55, 48),
    #                   'id': 285611}],
    #     'cover': 'https://royalroadlupload.blob.core.windows.net/...jpg',
    #     'tags': 'action,fantasy,martial_arts,male_lead,strategy,profanity,gore',
    #     'title': 'Rise of the Lord',
    #     'lastUpdate': datetime.datetime(2018, 8, 28, 1, 55, 48)
    # }

    series_id = series['id']

    sinfo = get_json(
        self.wg,
        "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".format(
            sid=series_id, key=settings.RRL_API_KEY))
    if not self.validate_sdata(sinfo):
        self.log.warning("Series data for sid %s failed validation", series_id)
        return

    # Raised explicitly so the check is not stripped when running under -O.
    if int(series_id) != int(sinfo['id']):
        raise AssertionError("Mismatchin series ID: %s -> %s (%s, %s)" % (
            series_id, sinfo['id'], type(series_id), type(sinfo['id']),
        ))

    cinfo = get_json(
        self.wg,
        "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}".format(
            sid=series_id, key=settings.RRL_API_KEY))
    if not self.validate_cdata(cinfo):
        return

    # Order matters! If ratingCount is 0, ratingValue is None (not 0), so the
    # count has to be tested first to avoid comparing None with a number.
    # NOTE(review): the previous condition returned when BOTH thresholds were
    # exceeded, i.e. it dropped exactly the well-rated series. It is treated
    # as an inverted guard here; confirm against the intended policy.
    if not sinfo.get('ratingCount', 0) > SeriesPageCommon.MIN_RATE_CNT:
        self.log.info("Series '%s' has insufficient ratings. Skipping.", series_id)
        return
    if not sinfo.get('ratingValue', 0) > SeriesPageCommon.MIN_RATING_FLOAT:
        self.log.info("Series '%s' rating below upload threshold. Skipping.", series_id)
        return

    author = sinfo.get("authorName")
    if not author:
        self.log.error("Could not find author for series '%s'", series_id)
        return

    raw_tags = sinfo['tags']
    if isinstance(raw_tags, str):
        tags = raw_tags.split(",")
    elif isinstance(raw_tags, (list, tuple)):
        tags = list(raw_tags)
    else:
        # Previously this left `tags` unbound; carry on without tags instead.
        self.log.error("Unknown tag container for series '%s': %r", series_id, raw_tags)
        tags = []
    tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

    description = self.extract_description(sinfo['description'])
    title = sinfo['title'].strip()

    seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(sid=series_id)

    seriesmeta = {
        'title': msgpackers.fix_string(title),
        'author': msgpackers.fix_string(author),
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': description,
        'tl_type': 'oel',
        'sourcesite': 'RoyalRoadL',
        'create_tags': True,
    }
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': seriesPageUrl,
        'sourcesite': 'RoyalRoadL',
    }

    trigger_urls = [seriesPageUrl]
    raw_retval = []
    for chapter in cinfo:
        chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
            cid=chapter['id'])
        chp_title = chapter['title']

        # The series title is appended to the text handed to the title parser.
        vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " + title)

        raw_item = {
            'srcname': "RoyalRoadL",
            # presumably a numeric timestamp in the chapter API response;
            # verify against get_json
            'published': float(chapter['date']),
            'linkUrl': chap_url,
        }
        raw_msg = msgpackers._buildReleaseMessage(raw_item, title, vol, chp, frag,
                                                  author=author, postfix=chp_title,
                                                  tl_type='oel', extraData=extra,
                                                  matchAuthor=True)
        trigger_urls.append(chap_url)
        raw_retval.append(raw_msg)

    raw_retval = SeriesPageCommon.check_fix_numbering(self.log, raw_retval,
                                                      series_id, rrl=True)

    self.amqp_put_item(meta_pkt)
    retval = [msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval]
    self.amqp_put_many(retval)
    self.low_priority_links_trigger(trigger_urls)
def extractSeriesReleases(self, seriesPageUrl, metadata, soup):
    """Build release messages for every part of a Wattpad story.

    Args:
        seriesPageUrl: URL of the story page; stored as the series homepage.
        metadata: Story metadata mapping. The keys read here are 'title',
            'user' -> 'name', 'description', 'tags', 'numParts', 'voteCount',
            'language' -> 'id', 'id' and 'parts' (each part providing
            'title', 'modifyDate' and 'url').
        soup: Parsed story page; only consulted for the category tags.

    Returns:
        A list of messages from ``msgpackers.buildReleaseMessage`` numbered
        sequentially from 1, or ``[]`` when the story is filtered out (fewer
        than 3 parts, fewer than 100 votes, non-English, blocked id, masked
        or missing required tags) or has no parts. The series-info packet is
        queued via ``self.amqp_put_item`` only when releases exist.
    """
    title = metadata['title']
    author = metadata['user']['name']
    # Copy so the tags scraped from the page do not leak into the caller's
    # metadata.
    tags = list(metadata['tags'])

    title = title.strip()

    # Siiiiiigh. Really?
    title = title.replace("[#wattys2015]", "")
    title = title.replace("(Wattys2015) ", "")
    title = title.replace("#Wattys2015", "")
    title = title.replace("Wattys2015", "")
    title = title.strip()

    if metadata['numParts'] < 3:
        return []
    if metadata['voteCount'] < 100:
        return []

    # Language ID 1 is english.
    if metadata['language']['id'] != 1:
        return []

    # Allow blocking of item by ID
    if metadata['id'] in BLOCK_IDS:
        return []

    # for some particularly stupid reasons, the item category tag is
    # not included in the metadata.
    # therefore, we parse it out from the page manually.
    tagdiv = soup.find("div", class_="tags")
    if tagdiv:
        for tag in tagdiv.find_all("a", class_='tag'):
            tags.append(tag.get_text())

    # Lower-case, collapse whitespace runs, hyphenate and de-duplicate.
    tags = list({"-".join(item.lower().split()) for item in tags})

    # Mask any content with any of the blocked tags.
    if any(item in tags for item in WATTPAD_MASKED_TAGS):
        self.log.warning("Item has a masked tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    # And check that at least one of the target tags is present.
    if not any(item in tags for item in WATTPAD_REQUIRED_TAGS):
        self.log.warning("Item missing required tag. Not emitting any releases.")
        self.log.warning("Tags: '%s'", tags)
        return []

    extra = {
        'tags': tags[:],
        'homepage': seriesPageUrl,
        'sourcesite': 'WattPad',
    }

    retval = []
    # Parts are numbered by their position in the list, not by their title.
    for index, release in enumerate(metadata['parts'], start=1):
        chp_title = release['title']

        dt = datetime.datetime.strptime(release['modifyDate'], "%Y-%m-%dT%H:%M:%SZ")
        reldate = calendar.timegm(dt.timetuple())

        raw_item = {
            'srcname': "WattPad",
            'published': reldate,
            'linkUrl': release['url'],
        }
        msg = msgpackers.buildReleaseMessage(raw_item, title, None, index, None,
                                             author=author, postfix=chp_title,
                                             tl_type='oel', extraData=extra,
                                             matchAuthor=True)
        retval.append(msg)

    # Don't send the series metadata if we didn't find any chapters.
    if not retval:
        self.log.warning("No chapters!")
        return []

    # Apparently the description is rendered in a <pre> tag.
    # Huh?
    desc = markdown.markdown(metadata['description'], extensions=["linkify"])

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': seriesPageUrl,
        'desc': desc,
        'tl_type': 'oel',
        'sourcesite': 'WattPad',
    }
    pkt = msgpackers.createSeriesInfoPacket(seriesmeta, beta=IS_BETA, matchAuthor=True)

    self.log.info("Wattpad scraper generated %s amqp messages!", len(retval) + 1)
    self.amqp_put_item(pkt)
    return retval
def extractSeriesReleases(self, seriesPageUrl, soup):
    """
    Build release packets for every chapter listed on a JapTem fanfic page.

    The series is rejected (an empty list is returned) when the title,
    author, stats line, synopsis or chapter list cannot be found or
    parsed, when a rating is present and below MIN_RATING, or when fewer
    than 3 chapters are advertised.

    When at least one release is produced, the series-info packet is also
    queued through ``self.amqp_put_item``.

    Args:
        seriesPageUrl: URL of the series page; chapter links are resolved
            against it.
        soup: Parsed series page.

    Returns:
        A list of packets built by ``msgpackers.createReleasePacket``, or
        an empty list when the series is filtered out or has no chapters.
    """
    title_div = soup.find("div", class_='fanfic_title_div')
    author_div = soup.find("div", class_='fanfic_author_div')
    stats_div = soup.find("div", class_='fanfic_title_wrapper')
    if title_div is None or author_div is None or stats_div is None:
        return []

    title = title_div.get_text()
    author = author_div.get_text()
    if not title or not author:
        return []

    # The stats line is the child of the title wrapper that mentions
    # "Rating"; it is split on "·" into rating, views and chapter count.
    stat_lines = [item for item in stats_div.contents if "Rating" in str(item)]
    if not stat_lines:
        return []
    stat_line = str(stat_lines.pop())

    fields = stat_line.split("·")
    if len(fields) != 3:
        return []
    rating, _views, chapter_count = fields

    try:
        # I think the japtem rating system is just plain out broken.
        if "no rating" not in stat_line.lower():
            rating_score = float(rating.split()[-1])
            if not rating_score >= MIN_RATING:
                return []

        if float(chapter_count.split()[0]) < 3:
            return []
    except (ValueError, IndexError):
        # Stats line did not have the expected "<label> <number>" layout.
        return []

    synopsis = soup.find('div', class_='fanfic_synopsis')
    if synopsis is None:
        print("No synopsis found on series page:", seriesPageUrl)
        return []

    # Paragraphs starting with "categories:" carry the comma-separated tag
    # list; every other paragraph is part of the description.
    tags = []
    desc = []
    for para in synopsis.find_all("p"):
        text = para.get_text()
        if text.lower().startswith('categories:'):
            tagstr = text.split(":", 1)[-1]
            tags.extend(item.strip() for item in tagstr.split(",") if item.strip())
        else:
            desc.append(para)

    seriesmeta = {
        'title': title,
        'author': author,
        'tags': tags,
        'homepage': '',
        'desc': " ".join([str(para) for para in desc]),
        'tl_type': 'oel',
        'sourcesite': 'JapTem Fanfic',
    }
    meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

    extra = {
        'tags': tags,
        'homepage': '',
        'sourcesite': 'JapTem Fanfic',
    }

    chapter_list = soup.find("ul", class_='fanfic_chapter_list')
    if chapter_list is None:
        return []

    retval = []
    for volume in chapter_list.find_all('li', class_='fanfic_volume'):
        releases = volume.find_all('li', class_='fanfic_chapter')
        if not releases:
            continue

        # The volume title is the same for every chapter in the volume.
        vol_str = volume.find('div', class_='fanfic_volume_title').get_text()

        for release in releases:
            link = release.find("a")
            chp_title = link.get_text()

            # The page exposes no release date, so the scrape time is used.
            reldate = time.time()

            agg_title = " ".join((vol_str, chp_title))
            vol, chp, frag, _post = extractTitle(agg_title)

            raw_item = {
                'srcname': 'JapTem Fanfic',
                'published': reldate,
                'linkUrl': urllib.parse.urljoin(seriesPageUrl, link['href']),
            }

            raw_msg = msgpackers.buildReleaseMessage(
                raw_item, title, vol, chp, frag,
                author=author, postfix=chp_title, tl_type='oel',
                extraData=extra, matchAuthor=True)
            retval.append(msgpackers.createReleasePacket(raw_msg))

    if not retval:
        return []

    self.amqp_put_item(meta_pkt)
    return retval