Code Example #1
    def extractSeriesReleases(self, row):

        tds = row.find_all("td")
        if len(tds) != 4:
            self.log.warning(
                "Row does not have four <td> tags! Don't know how to handle")
            pdtag = row.prettify()
            for line in pdtag.split("\n"):
                self.log.warning(line)

            return None
        title_td, ch_td, trans_td, release_td = tds

        title = title_td.find("div", class_='ellipsis-1').get_text(strip=True)

        author = trans_td.get_text(strip=True)

        if not title:
            return None
        if not author:
            return None

        # Cripes this is probably brittle
        series_type = "translated" if "," in author else "oel"

        reldate = float(release_td.span['data-timestamp'])

        chp_title = ch_td.get_text(strip=True)

        vol, chp, frag, _ = extractTitle(chp_title)

        raw_item = {}
        raw_item['srcname'] = 'FoxTeller'
        raw_item['published'] = reldate
        raw_item['linkUrl'] = urllib.parse.urljoin("https://www.foxteller.com",
                                                   ch_td.a['href'])

        raw_msg = msgpackers._buildReleaseMessage(
            raw_item=raw_item,
            series=title,
            vol=vol,
            chap=chp,
            frag=frag,
            # author      = author,
            postfix=chp_title,
            tl_type=series_type,
            # matchAuthor = True,
            # looseMatch  = True
        )

        msg = msgpackers.createReleasePacket(raw_msg)

        return msg
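
The row-unpacking above depends on FoxTeller's four-cell layout, the `ellipsis-1` title div, and a `data-timestamp` attribute on the release cell. A minimal, self-contained sketch of the same pattern against a made-up HTML fragment (the real markup may differ):

import urllib.parse

import bs4

# Hypothetical row in the shape extractSeriesReleases() expects.
ROW_HTML = """
<tr>
  <td><div class="ellipsis-1">Some Series</div></td>
  <td><a href="/chapter/1234">Chapter 12</a></td>
  <td>TranslatorA, TranslatorB</td>
  <td><span data-timestamp="1565000000">a while ago</span></td>
</tr>
"""

row = bs4.BeautifulSoup(ROW_HTML, "html.parser").tr
title_td, ch_td, trans_td, release_td = row.find_all("td")

print(title_td.find("div", class_="ellipsis-1").get_text(strip=True))  # Some Series
print(float(release_td.span["data-timestamp"]))                        # 1565000000.0
print(urllib.parse.urljoin("https://www.foxteller.com", ch_td.a["href"]))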
Code Example #2
    def dispatchNanoDesu(self, netloc, itemurl, itemtxt):
        itemtitle = NANO_DESU_MAP[netloc]
        vol, chp, frag, post = extractTitle(itemtxt)
        if not (vol or chp):
            return None

        raw_item = {}
        raw_item['srcname'] = "Nano Desu"
        raw_item['published'] = time.time()
        raw_item['linkUrl'] = itemurl

        self.low_priority_links_trigger([
            itemurl,
        ])

        msg = msgpackers._buildReleaseMessage(raw_item,
                                              itemtitle,
                                              vol,
                                              chp,
                                              frag,
                                              postfix=post)
        msg = msgpackers.createReleasePacket(msg)
        return msg
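
The dispatch works by mapping the release URL's host to a fixed series title. A sketch of that netloc-keyed lookup, with a hypothetical map standing in for NANO_DESU_MAP (which is defined elsewhere in the codebase):

import urllib.parse

# Hypothetical stand-in for NANO_DESU_MAP.
SITE_TITLE_MAP = {
    "someproject.nanodesutranslations.org": "Some Project",
}

def title_for_url(itemurl):
    netloc = urllib.parse.urlsplit(itemurl).netloc
    return SITE_TITLE_MAP.get(netloc)  # None when the host isn't mapped

print(title_for_url("https://someproject.nanodesutranslations.org/feed/post-1"))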
Code Example #3
    def dispatchBT(self, itemurl, itemtxt):
        titleonly = itemtxt.split("by")[0].split("bY")[0].split("By")[0].split(
            "BY")[0]
        probSeries = titleonly.lower().split("volume")[0].split(
            "chapter")[0].strip()

        vol, chp, frag, post = extractTitle(titleonly)

        raw_item = {}
        raw_item['srcname'] = "Baka-Tsuki"
        raw_item['published'] = time.time()
        raw_item['linkUrl'] = itemurl

        self.low_priority_links_trigger([
            itemurl,
        ])

        msg = msgpackers._buildReleaseMessage(raw_item,
                                              probSeries,
                                              vol,
                                              chp,
                                              frag,
                                              postfix=post)
        msg = msgpackers.createReleasePacket(msg)
        return msg
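
The chained splits only catch four casings of "by". A case-insensitive regex split covers every casing and, via word boundaries, avoids matching "by" inside another word; this is a sketch of an alternative, not what the code above actually does:

import re

def strip_byline(itemtxt):
    # Split on the first standalone "by"/"By"/"BY"/etc. and keep the title part.
    return re.split(r"(?i)\bby\b", itemtxt, maxsplit=1)[0].strip()

print(strip_byline("Some Title Volume 2 Chapter 5 BY SomeTranslator"))
# -> "Some Title Volume 2 Chapter 5"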
Code Example #4
    def extractSeriesReleases(self, seriesPageUrl, soup):

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)

        titletg = soup.find("div", class_='fic_title')
        authortg = soup.find("span", class_='auth_name_fic')

        if not titletg:
            self.log.error("Could not find title tag!")
            return []

        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        metas = soup.find_all("script", type="application/ld+json")
        agg_meta = {}
        for meta in metas:
            loaded = json.loads(meta.get_text())
            for k, v in loaded.items():
                agg_meta[k] = v

        rating = float(agg_meta.get('ratingValue', "0"))
        rating_cnt = float(agg_meta.get('ratingCount', "0"))

        self.log.info("Rating value: %s, Rating cnt: %s", rating, rating_cnt)

        if rating < SeriesPageCommon.MIN_RATING_STARS:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if rating_cnt < SeriesPageCommon.MIN_RATE_CNT:
            self.log.error("Item has insufficent ratings: %s", rating_cnt)
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='wi_fic_desc')
        if not descDiv or not descDiv.p:
            self.log.error("Incomplete or broken description?")
            return []

        desc = []
        for segment in descDiv:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]

        tags = []
        tagdiv = soup.find('span', class_='wi_fic_showtags')
        # Guard against series pages that have no tag block at all.
        if tagdiv:
            for tag in tagdiv.find_all('a', class_='stag'):
                tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
                tagtxt = SeriesPageCommon.fix_tag(tagtxt)
                tags.append(tagtxt)

        # These are separate on SH, but I'm just treating them as tags.
        for tag in soup.find_all('li', class_='mature_contains'):
            tagtxt = SeriesPageCommon.clean_tag(tag.get_text())
            tagtxt = SeriesPageCommon.fix_tag(tagtxt)
            tags.append(tagtxt)

        genres = []
        genrediv = soup.find('span', class_='wi_fic_genre')
        # Guard against series pages that have no genre block.
        if genrediv:
            for genre in genrediv.find_all('a', class_='fic_genre'):
                genretxt = SeriesPageCommon.clean_tag(genre.get_text())
                genretxt = SeriesPageCommon.fix_genre(genretxt)
                genres.append(genretxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'ScribbleHub'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        extra = {}
        extra['tags'] = tags
        extra['genres'] = genres
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'ScribbleHub'

        self.log.info("Found %s tags, %s genres", len(tags), len(genres))

        chapters = soup.find_all("li", class_='toc_w')

        raw_retval = []
        for chapter in chapters:

            cname, cdate = chapter.a, chapter.span

            if not (cname and cdate):
                self.log.warning("Row with invalid number of entries?")
                continue

            if not cdate.get("title"):
                self.log.error("No time entry?")
                continue

            timestr = cdate.get("title").strip()
            itemDate, status = parsedatetime.Calendar().parse(timestr)

            if status < 1:
                self.log.warning("Failure processing date: %s", timestr)
                continue

            reldate = time.mktime(itemDate)

            relurl = common.util.urlFuncs.rebaseUrl(cname['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "ScribbleHub"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series_id,
                                                          sh=True)

        # Do not add series without 3 chapters.
        if len(raw_retval) < 3:
            self.log.info("Less then three chapters!")
            return []

        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ] + [meta_pkt]

        self.log.info("Found %s chapter releases on series page!", len(retval))
        return retval
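
parsedatetime's Calendar().parse() returns a (struct_time, status) tuple, with status 0 meaning nothing could be parsed; that is what the `status < 1` guard above checks before handing the tuple to time.mktime(). A standalone demonstration:

import time

import parsedatetime

cal = parsedatetime.Calendar()
parsed, status = cal.parse("Jan 5, 2020 3:00 PM")
if status >= 1:
    # struct_time -> epoch seconds, interpreted in the local timezone.
    print(time.mktime(parsed))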
Code Example #5
    def extractSeriesReleases(self, seriesPageUrl, metadata, soup):

        title = metadata['title']
        author = metadata['user']['name']
        desc = metadata['description']
        tags = metadata['tags']

        # Apparently the description is rendered in a <pre> tag.
        # Huh?
        desc = markdown.markdown(desc, extensions=["mdx_linkify"])

        title = title.strip()

        # Siiiiiigh. Really?
        title = title.replace("[#wattys2015]", "")
        title = title.replace("(Wattys2015) ", "")
        title = title.replace("#Wattys2015", "")
        title = title.replace("Wattys2015", "")
        title = title.strip()

        if metadata['numParts'] < 3:
            return []
        if metadata['voteCount'] < 100:
            return []

        # Language ID 1 is english.
        if metadata['language']['id'] != 1:
            return []

        # Allow blocking of item by ID
        if metadata['id'] in BLOCK_IDS:
            return []

        # for some particularly stupid reasons, the item category tag is
        # not included in the metadata.
        # therefore, we parse it out from the page manually.
        tagdiv = soup.find("div", class_="tags")
        if tagdiv:
            for tag in tagdiv.find_all("a", class_='tag'):
                tags.append(tag.get_text())

        tags = list(
            set([
                item.lower().strip().replace("  ", " ").replace(" ", "-")
                for item in tags
            ]))

        # Mask any content with any of the blocked tags.
        if any([item in tags for item in WATTPAD_MASKED_TAGS]):
            self.log.warning(
                "Item has a masked tag. Not emitting any releases.")
            self.log.warning("Tags: '%s'", tags)
            return []

        # And check that at least one of the target tags is present.
        if not any([item in tags for item in WATTPAD_REQUIRED_TAGS]):
            self.log.warning(
                "Item missing required tag. Not emitting any releases.")
            self.log.warning("Tags: '%s'", tags)
            return []

        seriesmeta = {}

        extra = {}
        extra['tags'] = tags[:]
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'WattPad'

        retval = []
        index = 1
        valid = 1
        for release in metadata['parts']:
            chp_title = release['title']

            dt = datetime.datetime.strptime(release['modifyDate'],
                                            "%Y-%m-%dT%H:%M:%SZ")
            reldate = calendar.timegm(dt.timetuple())

            raw_item = {}
            raw_item['srcname'] = "WattPad"
            raw_item['published'] = reldate
            raw_item['linkUrl'] = release['url']
            msg = msgpackers._buildReleaseMessage(raw_item,
                                                  title,
                                                  None,
                                                  index,
                                                  None,
                                                  author=author,
                                                  postfix=chp_title,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
            retval.append(msg)

            # Check if there was substantive structure in the chapter
            # name. Used as a crude heuristic for chapter validity.
            # vol, chp, frag, post = extractTitle(chp_title)
            # if any((vol, chp, frag)):
            # 	# print("Valid: ", (vol, chp, frag))
            # 	valid += 1

            index += 1

        # if valid < (index/2):
        # 	print("Half the present chapters have no numeric content?")
        # 	return []

        # Don't send the series metadata if we didn't find any chapters.
        if not retval:
            print("No chapters!")
            return []

        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = desc
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'WattPad'

        pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                beta=IS_BETA,
                                                matchAuthor=True)
        self.log.info("Wattpad scraper generated %s amqp messages!",
                      len(retval) + 1)
        self.amqp_put_item(pkt)
        return retval
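
Because the "Z"-suffixed modifyDate strings are UTC, the code pairs strptime with calendar.timegm(), which interprets the time tuple as UTC; time.mktime() would wrongly apply the local timezone. A minimal check:

import calendar
import datetime

dt = datetime.datetime.strptime("2015-06-01T12:00:00Z", "%Y-%m-%dT%H:%M:%SZ")
print(calendar.timegm(dt.timetuple()))  # 1433160000, regardless of local timezone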
Code Example #6
    def process_series(self, series):

        expected_keys = [
            'chapters', 'cover', 'description', 'firstUpdate', 'id',
            'lastUpdate', 'tags', 'title'
        ]
        if not all([tmp in series for tmp in expected_keys]):
            self.log.error("Missing key(s) %s from series %s. Cannot continue",
                           [tmp for tmp in expected_keys if tmp not in series],
                           series)
            return

        # {
        # 	'topCover': None,
        # 	'description': "<p>Gerald, born a Viscount's son, spent most of his life since he was six as an enemy Duke's 'ward', nothing short "
        #  "of a hostage. Until a shocking letter arrived requesting that he be sent back to inherit his father's territory and title.</p>\n<p>Now "
        #  "he has to return and rule the ruin that is his family's lands. Bandits roam&nbsp;and enemies leer. Conspiracies brew and wars rage. "
        #  "Meanwhile, Gerald has to rise with his house from the ashes.</p>\n<p>&nbsp;</p>\n<p>Schedule: Updates 4 times a week--&gt; Monday-"
        #  "Thursday.</p>\n<p>&nbsp;</p>\n<p>Additional tags: Kingdom Building - Strategy - War - Army Building.</p>",
        # 	'id': 19290,
        # 	'firstUpdate': datetime.datetime(2018, 7, 10, 6, 35, 48),
        # 	'topCoverAlignment': 0,
        # 	'chapters': [{'title': 'Chapter 33',
        # 	'fictionId': 19290,
        # 	'date': datetime.datetime(2018, 8, 28, 1, 55, 48),
        # 	'id': 285611}],
        # 	'cover': 'https://royalroadlupload.blob.core.windows.net/thundersurfer/rise-of-the-lord-full-AAAASg1dcgo=.jpg',
        # 	'tags': 'action,fantasy,martial_arts,male_lead,strategy,profanity,gore',
        # 	'title': 'Rise of the Lord',
        # 	'lastUpdate': datetime.datetime(2018, 8, 28, 1, 55, 48)
        #  }

        sinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/info/{sid}?apikey={key}".
            format(sid=series['id'], key=settings.RRL_API_KEY))

        if not self.validate_sdata(sinfo):
            self.log.warning("Series data for sid %s failed validation" %
                             series['id'])
            return

        assert int(series['id']) == int(
            sinfo['id']), "Mismatchin series ID: %s -> %s (%s, %s)" % (
                series['id'],
                sinfo['id'],
                type(series['id']),
                type(sinfo['id']),
            )

        cinfo = get_json(
            self.wg,
            "https://www.royalroad.com/api/fiction/chapters/{sid}?apikey={key}"
            .format(sid=series['id'], key=settings.RRL_API_KEY))
        if not self.validate_cdata(cinfo):
            return

        # Order matters! If ratingCount is 0, ratingValue is None (not 0),
        # so the count check must short-circuit before the value comparison.
        # Skip the series unless it clears both thresholds.
        if not (sinfo.get('ratingCount', 0) > SeriesPageCommon.MIN_RATE_CNT
                and sinfo.get('ratingValue', 0) >
                SeriesPageCommon.MIN_RATING_FLOAT):
            return

        author = sinfo.get("authorName")

        if not author:
            self.log.error("Could not find author for series '%s'",
                           series['id'])
            return

        if isinstance(sinfo['tags'], str):
            tags = sinfo['tags'].split(",")
        elif isinstance(sinfo['tags'], (list, tuple)):
            tags = list(sinfo['tags'])
        else:
            print("sinfo unknown type: ", sinfo['tags'])
            print("Sinfo: ", sinfo)
            # `tags` is unbound past this point, so bail out.
            return

        tags = [SeriesPageCommon.fix_tag(tag) for tag in tags]

        description = self.extract_description(sinfo['description'])

        title = sinfo['title'].strip()

        seriesmeta = {}

        seriesPageUrl = "https://www.royalroad.com/fiction/{sid}".format(
            sid=series['id'])

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = description
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True
        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)

        trigger_urls = [seriesPageUrl]

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        raw_retval = []
        for chapter in cinfo:

            reldate = chapter['date']
            chap_url = "https://www.royalroad.com/fiction/chapter/{cid}".format(
                # sid = series['id'],
                cid=chapter['id'], )

            chp_title = chapter['title']
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = chap_url

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            trigger_urls.append(chap_url)
            raw_retval.append(raw_msg)

        raw_retval = SeriesPageCommon.check_fix_numbering(self.log,
                                                          raw_retval,
                                                          series['id'],
                                                          rrl=True)

        self.amqp_put_item(meta_pkt)
        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        self.amqp_put_many(retval)
        self.low_priority_links_trigger(trigger_urls)
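
Why the ordering of the rating checks matters: when ratingCount is 0 the API reports ratingValue as None, and comparing None against a float raises TypeError in Python 3. Putting the count check first lets `and` short-circuit before the value is ever compared:

sinfo = {"ratingCount": 0, "ratingValue": None}

# False via short-circuit; sinfo["ratingValue"] > 3.5 is never evaluated,
# so the None value never triggers a TypeError.
ok = sinfo.get("ratingCount", 0) > 5 and sinfo.get("ratingValue", 0) > 3.5
print(ok)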
Code Example #7
	def extractSeriesReleases(self, seriesPageUrl, soup):
		title  = soup.find("div", class_='fanfic_title_div').get_text()
		author = soup.find("div", class_='fanfic_author_div').get_text()
		ratingtg = soup.find("div", class_='fanfic_title_wrapper')
		ratingtg = [item for item in ratingtg.contents if "Rating" in str(item)]
		if not ratingtg:
			# No "Rating · views · chapters" header at all; can't vet the series.
			return []
		ratingtg = ratingtg.pop()

		rating, views, chapters = ratingtg.split("·")

		# I think the japtem rating system is just plain out broken.
		if "no rating" not in ratingtg.lower():
			rating_score = float(rating.split()[-1])
			if rating_score < MIN_RATING:
				return []


		chapter_num = float(chapters.split()[0])
		if chapter_num < 3:
			return []



		if not title:
			return []
		if not author:
			return []


		descDiv = soup.find('div', class_='fanfic_synopsis')

		if not descDiv:
			print(soup)
			return []

		paras = descDiv.find_all("p")
		tags = []

		desc = []
		for para, text in [(para, para.get_text()) for para in paras]:
			if text.lower().startswith('categories:'):
				tagstr = text.split(":", 1)[-1]
				items = tagstr.split(",")
				tags.extend(item.strip() for item in items if item.strip())
			else:
				desc.append(para)


		seriesmeta = {}

		seriesmeta['title']       = title
		seriesmeta['author']      = author
		seriesmeta['tags']        = tags
		seriesmeta['homepage']    = ''
		seriesmeta['desc']        = " ".join([str(para) for para in desc])
		seriesmeta['tl_type']     = 'oel'
		seriesmeta['sourcesite']  = 'JapTem Fanfic'


		meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta, matchAuthor=True)

		extra = {}
		extra['tags']     = tags
		extra['homepage'] = ''
		extra['sourcesite']  = 'JapTem Fanfic'

		retval = []

		chapters = soup.find("ul", class_='fanfic_chapter_list')
		volumes = chapters.find_all('li', class_='fanfic_volume')
		for volume in volumes:
			releases = volume.find_all('li', class_='fanfic_chapter')
			for release in releases:
				chp_title = release.find("a")

				vol_str = volume.find('div', class_='fanfic_volume_title').get_text()
				reldate = time.time()

				chp_title = chp_title.get_text()

				agg_title = " ".join((vol_str, chp_title))
				vol, chp, frag, post = extractTitle(agg_title)

				raw_item = {}
				raw_item['srcname']   = 'JapTem Fanfic'
				raw_item['published'] = reldate
				releaseurl = urllib.parse.urljoin(seriesPageUrl, release.a['href'])
				raw_item['linkUrl']   = releaseurl


				raw_msg = msgpackers._buildReleaseMessage(
									raw_item    = raw_item,
									series      = title,
									vol         = vol,
									chap        = chp,
									frag        = frag,
									author      = author,
									postfix     = chp_title,
									tl_type     = 'oel',
									extraData   = extra,
									matchAuthor = True,
									looseMatch  = True
								)

				msg     = msgpackers.createReleasePacket(raw_msg)

				retval.append(msg)

		if not retval:
			return []

		self.amqp_put_item(meta_pkt)
		return retval
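
A sketch of the "·"-separated header parsing above, against a made-up string in the shape the code expects (the real JapTem markup may differ):

ratingtg = "Rating: 4.2 · 1234 views · 15 chapters"

rating, views, chapters = ratingtg.split("·")
rating_score = float(rating.split()[-1])   # 4.2
chapter_num = float(chapters.split()[0])   # 15.0
print(rating_score, chapter_num)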
Code Example #8
    def extractSeriesReleases(self, seriesPageUrl, soup):

        # Yeah, the title text is in a div with an id of "titlePic".
        # The actual image is in a div with the /class/ titlePic
        # wat.
        titlecontainer = soup.find("div", id='titlePic')
        if not titlecontainer:
            titlecontainer = soup.find("div", id='title')
        if not titlecontainer:
            raise ValueError("No title at URL: '%s'", seriesPageUrl)

        titletg = titlecontainer.h1
        typetg, authortg, categorytg = titlecontainer.find_all("a")

        if "novel" not in typetg.get_text().lower():
            return []

        if not titletg:
            return []
        if not authortg:
            return []

        title = titletg.get_text()
        author = authortg.get_text()
        genre = categorytg.get_text()

        descDiv = soup.find('p', class_='summary')
        for item in descDiv.find_all("a"):
            item.decompose()
        desc = [
            item.strip() for item in descDiv.find_all(text=True)
            if item.strip()
        ]

        tagdiv = soup.find("div", id='cloudMain')

        tags = []
        # Skip if no tags
        if tagdiv:
            tags = [
                item.get_text().strip().lower()
                for item in tagdiv.find_all("a")
            ]

        tags.append(genre.lower())
        # Fix a lot of the stupid tag fuckups I've seen.
        # People are stupid.
        if 'science' in tags and 'fiction' in tags:
            tags.append("science-fiction")
        tags = [tag for tag in tags if tag not in BAD_TAGS]
        tags = [tag for tag in tags if len(tag) > 2]
        tags = [tag.replace("  ", " ").replace(" ", "-") for tag in tags]
        tags = list(set(tags))

        if not any([tag in BOOKSIE_REQUIRED_TAGS for tag in tags]):
            self.log.info("Missing required tags!")
            return []
        if any([tag in BOOKSIE_MASKED_TAGS for tag in tags]):
            self.log.info("Masked tag!")
            return []

        # Wrap the paragraphs in p tags.
        desc = ['<p>{text}</p>'.format(text=para) for para in desc]

        seriesmeta = {}
        seriesmeta['title'] = title
        seriesmeta['author'] = author
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\n\n ".join([str(para) for para in desc])
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'Booksie'

        pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                beta=IS_BETA,
                                                matchAuthor=True)

        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'Booksie'

        # Decompose the announcement (?) div that's cluttering up the
        # search for the chapterdiv
        badchp = soup.find("div", class_='chapters', id='noticeMessage')
        if badchp:
            badchp.decompose()

        chapters = soup.find("div", class_='chapters')
        releases = chapters.find_all('a')

        retval = []
        for release in releases:

            # No post time, unfortunately
            chp = int(release.get_text())

            # Force releases to the beginning of time until we catch up.
            reldate = 0

            vol = None
            frag = None

            raw_item = {}
            raw_item['srcname'] = "Booksie"
            raw_item['published'] = reldate
            raw_item['linkUrl'] = release['href']

            msg = msgpackers._buildReleaseMessage(raw_item,
                                                  title,
                                                  vol,
                                                  chp,
                                                  frag,
                                                  author=author,
                                                  tl_type='oel',
                                                  extraData=extra,
                                                  matchAuthor=True)
            retval.append(msg)

        if not retval:
            print("No releases?")
            return []
        self.amqp_put_item(pkt)
        return retval
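
The tag cleanup above (patch the split "science"/"fiction" pair, drop junk and too-short tags, kebab-case, de-duplicate) distilled into a standalone helper; the BAD_TAGS values here are placeholders for the real list defined elsewhere:

BAD_TAGS = {"misc", "other"}  # placeholder values

def normalize_tags(tags):
    if "science" in tags and "fiction" in tags:
        tags.append("science-fiction")
    tags = [tag for tag in tags if tag not in BAD_TAGS]
    tags = [tag for tag in tags if len(tag) > 2]
    tags = [tag.replace("  ", " ").replace(" ", "-") for tag in tags]
    return list(set(tags))

print(normalize_tags(["science", "fiction", "misc", "space opera"]))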
Code Example #9
    def extractSeriesReleases(self, seriesPageUrl, soup):

        match = self.match_re.search(seriesPageUrl)
        series_id = match.group(1)
        conf = load_lut()

        assert 'force_sequential_numbering' in conf

        must_renumber = series_id in conf['force_sequential_numbering']

        # print("")
        # print("Match: ", match, match.groups(), series_id)
        # print("series_id", series_id)
        # print("Renumber:", must_renumber)

        header = soup.find("div", class_='fic-title')
        if not header:
            self.log.warning(
                "Series page %s contains no releases. Is this series removed?",
                seriesPageUrl)
            return []

        titletg = header.find("h1")
        authortg = header.find("h4")
        authortg.find("span").decompose()

        rating_val = soup.find("meta", property='books:rating:value')
        rating_scale = soup.find("meta", property='books:rating:scale')

        # print("Rating value:", rating_val)
        # print("Rating scale:", rating_scale)

        if not rating_val or not rating_scale:
            return []

        rval_f = float(rating_val.get('content', "0"))
        rscale_f = float(rating_scale.get('content', "999999"))

        rating = 5 * (rval_f / rscale_f)

        # print("Float rating: ", rating)

        if rating < MIN_RATING:
            self.log.error("Item rating below upload threshold: %s", rating)
            return []

        if not titletg:
            self.log.error("Could not find title tag!")
            return []
        if not authortg:
            self.log.error("Could not find author tag!")
            return []

        title = titletg.get_text().strip()
        author = authortg.get_text().strip()

        title = bleach.clean(title,
                             tags=[],
                             attributes=[],
                             styles=[],
                             strip=True,
                             strip_comments=True)
        author = bleach.clean(author,
                              tags=[],
                              attributes=[],
                              styles=[],
                              strip=True,
                              strip_comments=True)

        descDiv = soup.find('div', class_='description')
        if not descDiv or not descDiv.div:
            self.log.error("Incomplete or broken description?")
            return []

        desc = []
        for segment in descDiv.div:
            if isinstance(segment, bs4.NavigableString):
                desc.append(str(segment).strip())
            else:
                if segment.get_text().strip():
                    desc.append(segment.get_text().strip())

        desc = ['<p>{}</p>'.format(line) for line in desc if line.strip()]
        # print(desc)

        tags = []
        tagdiv = soup.find('span', class_='tags')
        # Guard against series pages that have no tag block at all.
        if tagdiv:
            for tag in tagdiv.find_all('span', class_='label'):
                tagtxt = tag.get_text().strip().lower().replace(" ", "-")
                # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
                if tagtxt in conf['tag_rename']:
                    tagtxt = conf['tag_rename'][tagtxt]
                tags.append(tagtxt)

        info_div = soup.find("div", class_='fiction-info')
        warning_div = info_div.find("div", class_='font-red-sunglo')
        if warning_div:
            for warning_tag in warning_div.find_all('li'):
                tagtxt = warning_tag.get_text().strip().lower().replace(
                    " ", "-")
                # print("Tag: ", (tagtxt, tagtxt in conf['tag_rename']))
                if tagtxt in conf['tag_rename']:
                    tagtxt = conf['tag_rename'][tagtxt]
                tags.append(tagtxt)

        seriesmeta = {}

        seriesmeta['title'] = msgpackers.fix_string(title)
        seriesmeta['author'] = msgpackers.fix_string(author)
        seriesmeta['tags'] = tags
        seriesmeta['homepage'] = seriesPageUrl
        seriesmeta['desc'] = "\r\n".join(desc)
        seriesmeta['tl_type'] = 'oel'
        seriesmeta['sourcesite'] = 'RoyalRoadL'
        seriesmeta['create_tags'] = True

        meta_pkt = msgpackers.createSeriesInfoPacket(seriesmeta,
                                                     matchAuthor=True)
        extra = {}
        extra['tags'] = tags
        extra['homepage'] = seriesPageUrl
        extra['sourcesite'] = 'RoyalRoadL'

        chapters = soup.find_all("tr", attrs={"data-url": True})

        raw_retval = []
        for chapter in chapters:
            if len(chapter.find_all("td")) != 2:
                self.log.warning("Row with invalid number of entries?")
                continue
            cname, cdate = chapter.find_all("td")

            timestr = cdate.get_text(strip=True)
            itemDate, status = parsedatetime.Calendar().parse(timestr)

            if status < 1:
                continue

            reldate = time.mktime(itemDate)

            relurl = common.util.urlFuncs.rebaseUrl(cname.a['href'],
                                                    seriesPageUrl)

            chp_title = cname.get_text().strip()
            # print("Chp title: '{}'".format(chp_title))
            vol, chp, frag, _ = titleParsers.extractTitle(chp_title + " " +
                                                          title)

            raw_item = {}
            raw_item['srcname'] = "RoyalRoadL"
            raw_item['published'] = float(reldate)
            raw_item['linkUrl'] = relurl

            raw_msg = msgpackers._buildReleaseMessage(raw_item,
                                                      title,
                                                      vol,
                                                      chp,
                                                      frag,
                                                      author=author,
                                                      postfix=chp_title,
                                                      tl_type='oel',
                                                      extraData=extra,
                                                      matchAuthor=True)

            # print("Chapter:", raw_item)
            raw_retval.append(raw_msg)

        missing_chap = 0
        for item in raw_retval:
            if not (item['vol'] or item['chp']):
                missing_chap += 1

        if raw_retval:
            unnumbered = (missing_chap / len(raw_retval)) * 100
            if (len(raw_retval) >= 5 and unnumbered > 80) or must_renumber:
                if must_renumber:
                    self.log.warning(
                        "Item numbering force-overridden! Adding simple sequential chapter numbers."
                    )
                else:
                    self.log.warning(
                        "Item seems to not have numbered chapters. Adding simple sequential chapter numbers."
                    )
                chap = 1
                for item in raw_retval:
                    item['vol'] = None
                    item['chp'] = chap
                    chap += 1

        # Do not add series without 3 chapters.
        if len(raw_retval) < 3:
            self.log.info("Less then three chapters!")
            return []

        if not raw_retval:
            self.log.info("Retval empty?!")
            return []

        self.amqp_put_item(meta_pkt)

        retval = [
            msgpackers.createReleasePacket(raw_msg) for raw_msg in raw_retval
        ]
        return retval
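
The renumbering fallback above, distilled: when a series has at least five releases and more than 80% of them carry no parsed volume/chapter number (or renumbering is force-overridden), the parsed numbering is discarded and the releases are numbered sequentially. A sketch over plain dicts:

def renumber_if_unnumbered(items, force=False):
    """items are dicts with 'vol' and 'chp' keys, in release order."""
    if not items:
        return items
    missing = sum(1 for item in items if not (item['vol'] or item['chp']))
    unnumbered_pct = missing / len(items) * 100
    if force or (len(items) >= 5 and unnumbered_pct > 80):
        for chap, item in enumerate(items, start=1):
            item['vol'] = None
            item['chp'] = chap
    return items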