Example #1
0
	def getCurrentInfo(self, fic: Fic) -> Fic:
		"""Re-scrape the fic's page and parse fresh metadata into it."""
		infoUrl = self.constructUrl(fic.localId)
		res = scrape.scrape(infoUrl)
		raw = res['raw']
		# archive the raw page for post-mortem debugging
		edumpContent('<!-- {} -->\n{}'.format(infoUrl, raw), 'sugarquill_ec')
		return self.parseInfoInto(fic, raw)
    def getCurrentInfo(self, fic: Fic) -> Fic:
        """Fetch the fic's page anew, rate-limited, and parse its info."""
        pageUrl = self.constructUrl(fic.localId)
        fetched = scrape.scrape(pageUrl)
        # be polite to the archive between requests
        time.sleep(self.baseDelay)

        raw = fetched['raw']
        edumpContent('<!-- {} -->\n{}'.format(pageUrl, raw), 'hpffa_ec')
        return self.parseInfoInto(fic, raw)
Example #3
0
	def extractContent(self, fic: Fic, html: str) -> str:
		"""Pull the chapter body (the div named 'Normal') out of a page."""
		from bs4 import BeautifulSoup  # type: ignore
		parsed = BeautifulSoup(html, 'html.parser')
		content = parsed.find('div', {'name': 'Normal'})
		if content is not None:
			return str(content)
		# dump the page so the failure can be inspected later
		edumpContent(html, 'fa_ec')
		raise Exception('unable to find normalDiv, e-dumped')
Example #4
0
    def getCurrentInfo(self, fic: Fic) -> Fic:
        """Refresh fic metadata from the table-of-contents page."""
        fic.url = self.constructUrl(fic.localId)
        tocUrl = self.tocUrl
        page = scrape.scrape(tocUrl)
        raw = page['raw']
        # keep a copy of the raw ToC for debugging
        edumpContent('<!-- {} -->\n{}'.format(tocUrl, raw), 'wavesarisen_ec')

        fic = self.parseInfoInto(fic, raw)
        fic.upsert()
        return Fic.lookup((fic.id, ))
Example #5
0
    def extractContent(self, fic: Fic, html: str) -> str:
        """Extract the story text row from the page's table layout."""
        from bs4 import BeautifulSoup  # type: ignore
        parsed = BeautifulSoup(html, 'html5lib')
        fullWidthTables = parsed.findAll('table', {'width': '100%'})
        # the layout is expected to have exactly five full-width tables
        if len(fullWidthTables) != 5:
            edumpContent(html, 'aff')
            raise Exception(
                'table count mismatch: {}'.format(len(fullWidthTables)))
        # the third table holds the fic; its sixth row is the chapter text
        rows = fullWidthTables[2].findAll('tr')

        return str(rows[5])
Example #6
0
    def getRealAuthorPost(self, fic: 'Fic') -> Any:
        """Return the first post of the fic's thread (the author's post)."""
        from bs4 import BeautifulSoup
        url = self.baseUrl + 'threads/' + str(fic.localId)
        page = self.scrapeLike(url)

        parsed = BeautifulSoup(page, 'html5lib')

        messages = parsed.find_all(self.postContainer, {'class': 'message'})
        if not messages:
            # dump the thread html so the miss can be diagnosed
            edumpContent(page, 'xen')
            raise Exception(f'error: unable to find author from {url}')
        return messages[0]
Example #7
0
	def create(self, fic: Fic) -> Fic:
		"""Scrape a fic for the first time and persist its metadata."""
		fic.url = self.constructUrl(fic.localId)

		# fetch the live page
		res = scrape.scrape(fic.url)
		raw = res['raw']

		edumpContent(raw, 'sugarquill')

		fic = self.parseInfoInto(fic, raw)
		fic.upsert()

		return Fic.lookup((fic.id, ))
Example #8
0
	def extractContent(self, fic: Fic, html: str) -> str:
		"""Return the #chapters element with its section headings stripped."""
		from bs4 import BeautifulSoup
		parsed = BeautifulSoup(html, 'html.parser')
		chapterBlock = parsed.find(id='chapters')
		if chapterBlock is None:
			edumpContent(html, 'ao3_ec')
			raise Exception('unable to find chapters, e-dumped')
		# drop the 'Notes' and 'Chapter Text' h3 headings from the body
		headings = chapterBlock.find_all('h3', {'class': 'heading'})
		for h in headings:
			h.extract()

		return str(chapterBlock)
    def create(self, fic: Fic) -> Fic:
        """First-time import: scrape, parse, and store the fic."""
        fic.url = self.constructUrl(fic.localId)

        # fetch the live page, then wait out the rate limit
        res = scrape.scrape(fic.url)
        time.sleep(self.baseDelay)

        raw = res['raw']
        edumpContent(raw, 'hpffa')

        fic = self.parseInfoInto(fic, raw)
        fic.upsert()

        return Fic.lookup((fic.id, ))
Example #10
0
	def create(self, fic: Fic) -> Fic:
		"""Import a fic for the first time, caching chapter 1's html."""
		fic.url = self.baseUrl + str(fic.localId)

		# strip any query string and force the adult-content gate open
		adultUrl = fic.url.split('?')[0] + '?view_adult=true'
		res = scrape.scrape(adultUrl)
		raw = res['raw']

		edumpContent(raw, 'ao3')

		fic = self.parseInfoInto(fic, raw)
		fic.upsert()

		# the scraped page doubles as the first chapter's content
		chapter = fic.chapter(1)
		chapter.setHtml(raw)
		chapter.upsert()

		return Fic.lookup((fic.id, ))
    def extractContent(self, fic: Fic, html: str) -> str:
        """Find the 'Story' block inside #mainpage and return its content."""
        from bs4 import BeautifulSoup  # type: ignore
        parsed = BeautifulSoup(html, 'html.parser')
        mainpage = parsed.find(id='mainpage')
        if mainpage is None:
            edumpContent(html, 'hpffa_ec')
            raise Exception('unable to find mainpage, e-dumped')

        # scan each layout block for the one titled 'Story'
        for section in mainpage.findAll('div', {'class': 'block'}):
            heading = section.find('div', {'class': 'title'})
            if heading is None or heading.contents[0] != 'Story':
                continue
            body = section.find('div', {'class': 'content'})
            if body is not None:
                return str(body)

        edumpContent(html, 'hpffa_ec')
        raise Exception('unable to find content, e-dumped')
Example #12
0
    def getPostUpdatedOrPublished(self, post: Any) -> int:
        """Return the unix timestamp a post was last edited or published.

        Handles both old-style and new-style XenForo markup. Raises if no
        timestamp can be located (after e-dumping the post for debugging).
        """
        # old style xen foro: timestamp lives inside a messageMeta div
        messageMeta = post.find_all('div', {'class': 'messageMeta'})
        if len(messageMeta) == 1:
            dt = messageMeta[0].find_all('span', {'class': 'DateTime'})
            ts = None
            if len(dt) == 1:
                dt = dt[0]
                ts = dt.get('title')
            else:
                dt = messageMeta[0].find_all('abbr', {'class': 'DateTime'})
                if len(dt) != 1:
                    raise Exception(
                        'error: unable to find message meta datetime')
                dt = dt[0]
                ts = dt.get_text()
            if ts is None:
                # a DateTime span matched but carried no title attribute;
                # fail loudly rather than passing None to the date parser
                raise Exception(
                    'error: message meta datetime has no timestamp')

            tsp = dateutil.parser.parse(ts)
            uts = util.dtToUnix(tsp)
            return uts

        if len(messageMeta) > 1:
            # fix: old message claimed the meta was missing when it was
            # actually ambiguous (multiple matches)
            raise Exception('error: found multiple message meta divs')

        # new xen foro style: prefer the last-edit time over publish time
        lastEdit = post.find('div', {'class': 'message-lastEdit'})
        if lastEdit is not None:
            t = lastEdit.find('time')
            return int(t.get('data-time'))

        postPublish = post.find('div', {'class': 'message-attribution-main'})
        if postPublish is not None:
            t = postPublish.find('time')
            return int(t.get('data-time'))

        postPublish = post.find('header', {'class': 'message-attribution'})
        if postPublish is not None:
            t = postPublish.find('time')
            return int(t.get('data-time'))

        edumpContent(str(post), 'xen_post' + util.randomString())
        raise Exception('unable to find post update or publish ts')
Example #13
0
	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		"""Parse a Sugar Quill story page into `fic`.

		Also scrapes the author's profile page to recover the description,
		review count, and last-updated date, then upserts the fic and
		caches every chapter to compute the total word count.

		Raises if any expected page structure (info pane, author link,
		title, matching story table, reviews, updated date) is missing.
		"""
		from bs4 import BeautifulSoup
		# normalize line endings before parsing / regex matching
		html = html.replace('\r\n', '\n')
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		# the info2_pane cell holds the title, author link, and chapter select
		infoPane = soup.findAll('td', {'class': 'info2_pane'})
		if len(infoPane) != 1:
			raise Exception('unable to find info2_pane: {}'.format(fic.url))
		infoPane = infoPane[0]

		# locate the author via their profile link inside the info pane
		authorHrefPrefix = 'index.php?action=profile&id='
		authorLinks = infoPane.findAll('a')
		authorUrl = None
		for authorLink in authorLinks:
			if not authorLink.get('href').startswith(authorHrefPrefix):
				continue

			authorUrl = self.baseUrl + '/' + authorLink.get('href')
			author = authorLink.getText()
			authorLocalId = authorLink.get('href')[len(authorHrefPrefix):]

			self.setAuthor(fic, author, authorUrl, authorLocalId)
			break
		else:
			# for/else: no link matched the profile prefix
			raise Exception('unable to find author: {}'.format(fic.url))

		# the title sits between the <b>Story</b> and <b>Chapter</b> labels
		titleMatch = re.search(
			'<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane), re.MULTILINE
		)
		if titleMatch is None:
			edumpContent(str(infoPane), 'sugarquill_title')
			raise Exception('could not locate title')

		fic.title = titleMatch.group(1).replace('&nbsp;', ' ').strip()

		# the chapter <select> options give chapter ids and titles
		chapterOptions = infoPane.findAll('option')
		chapterTitles = {}
		for chapterOption in chapterOptions:
			cid = int(chapterOption.get('value'))
			chapterTitles[cid] = chapterOption.getText().strip()
		fic.chapterCount = len(chapterOptions)

		fic.ageRating = '<unkown>'  # TODO
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

		# the story page lacks description/reviews/updated date; pull them
		# from the author's profile page instead
		authorProfileHtml = scrape.scrape(authorUrl)['raw']
		authorProfileHtml = authorProfileHtml.replace('\r', '')
		authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

		# find the profile story table whose read link matches our story id
		storyTables = authorSoup.findAll('table', {'width': '90%'})
		ourStoryTable = None
		for storyTable in storyTables:
			storyId = None
			for a in storyTable.findAll('a'):
				if not a.get('href').startswith('read.php?storyid='):
					continue
				storyId = a.get('href')[len('read.php?storyid='):]
				storyId = storyId[:storyId.find('&')]
				storyId = str(int(storyId))  # normalize (drops leading zeros)
			if storyId is None:
				continue
			if storyId != str(fic.localId):
				continue
			ourStoryTable = storyTable
		if ourStoryTable is None:
			raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

		# expected rows: header (reviews), description, last-updated
		trs = ourStoryTable.findAll('tr')
		if len(trs) != 3:
			raise Exception(
				f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
			)

		fic.description = trs[1].find('td').getText().strip()

		reviewsMatch = re.search(
			'\( Reviews: <a[^>]*>(\\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
		)
		if reviewsMatch is None:
			edumpContent(str(trs[0]), 'sugarquill_reviews')
			raise Exception('could not locate reviews')

		fic.reviewCount = int(reviewsMatch.group(1).strip())

		updatedMatch = re.search('Last updated (\\d+/\\d+/\\d+)', str(trs[2]))
		if updatedMatch is None:
			edumpContent(str(trs[2]), 'sugarquill_updated')
			raise Exception('could not locate last updated')

		fic.updated = OilTimestamp(
			util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
		)
		if fic.published is None:
			fic.published = fic.updated

		# upsert before caching chapters so they can reference the fic row
		fic.wordCount = 0
		fic.upsert()

		# cache each chapter and accumulate the total word count
		for cid in range(fic.chapterCount):
			ch = fic.chapter(cid + 1)
			ch.localChapterId = str(cid + 1)
			ch.title = chapterTitles[cid + 1]
			ch.cache()
			ch.upsert()
			chtml = ch.html()
			if chtml is not None:
				fic.wordCount += len(chtml.split())

		fic.add(Fandom.define('Harry Potter'))
		# TODO: chars/relationship?

		return fic
Example #14
0
    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        """Parse a SIYE story page into `fic`: title, author, summary,
        rating, chapter count, status, published/updated dates, word count.

        Raises if the page layout (three width-95% tables, author link,
        summary markers) cannot be located.
        """
        from bs4 import BeautifulSoup
        html = html.replace('\r\n', '\n')
        soup = BeautifulSoup(html, 'html.parser')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        w95tables = soup.findAll('table', {'width': '95%'})
        if len(w95tables) != 3:
            raise Exception('wrong number of w95 tables: {}'.format(
                len(w95tables)))

        # the first table is the story header, including the title
        ficInfoTable = w95tables[0]
        ficTitleH3 = ficInfoTable.find('h3')
        fic.title = ficTitleH3.get_text().strip()

        # fix: raw string -- '\?' and '\d' are invalid escapes in a plain
        # string literal (SyntaxWarning on modern Python)
        authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
        if authorUrlMatch is None:
            raise Exception('could not locate author url')

        author = authorUrlMatch.group(2)
        authorId = authorUrlMatch.group(1)
        authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId

        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO: this may miss multiline summaries :(
        summaryMatch = re.search(
            '<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE)
        if summaryMatch is None:
            edumpContent(html, 'siye_summary')
            raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

        fic.description = summaryMatch.group(1).strip()

        fic.ageRating = '<unkown>'

        ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
        if ageRatingMatch is not None:
            fic.ageRating = ageRatingMatch.group(1).strip()

        # chapter count = highest chapter index linked from this page;
        # single-chapter fics link chapter=Array instead of a number
        maxChapter = 0
        baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
        singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
            fic.localId)
        isSingleChapterFic = False
        for a in soup.find_all('a'):
            href = a.get('href')
            if href is None or not href.startswith(baseChapterHref):
                continue
            if href.startswith(singleChapterHref):
                isSingleChapterFic = True
                maxChapter = max(1, maxChapter)
                continue
            cid = int(href[len(baseChapterHref):])
            maxChapter = max(cid, maxChapter)

        fic.chapterCount = maxChapter

        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        fic.ficStatus = FicStatus.ongoing
        # fix: str.find returns -1 (truthy) when absent and 0 (falsy) at
        # position 0, so `if html.find(...)` marked nearly every fic
        # complete; use a membership test instead
        if 'Story is Complete' in html:
            fic.ficStatus = FicStatus.complete

        # published = earliest 'updated on' stamp seen, updated = latest
        # NOTE(review): the '.' separators are intentionally unescaped to
        # match whatever delimiter the site uses -- confirm before tightening
        updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
        minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
        maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
        for (year, month, day) in re.findall(updatedOnPattern, html):
            date = '{}/{}/{}'.format(year, month, day)
            dt = util.parseDateAsUnix(date, fic.fetched)

            minUpdate = min(minUpdate, dt)
            maxUpdate = max(maxUpdate, dt)

        if fic.published is None or fic.published.toUTS() > minUpdate:
            fic.published = OilTimestamp(minUpdate)
        if fic.updated is None or fic.updated.toUTS() < maxUpdate:
            fic.updated = OilTimestamp(maxUpdate)
        if fic.updated < fic.published:
            fic.updated = fic.published

        # sum the per-chapter word counts listed on the page
        fic.wordCount = 0
        wordsPattern = re.compile(r'(\d+) words')
        for words in re.findall(wordsPattern, html):
            fic.wordCount += int(words)

        if fic.wordCount == 0 and isSingleChapterFic:
            # best effort: derive the count from the cached chapter text
            try:
                fic.upsert()
                ch1 = fic.chapter(1)
                ch1.cache()
                chtml = ch1.html()
                if chtml is not None:
                    fic.wordCount = len(chtml.split())
            except Exception:
                # fix: bare except also swallowed KeyboardInterrupt/SystemExit
                pass

        fic.add(Fandom.define('Harry Potter'))
        # TODO: chars/relationship?

        return fic