Example #1
0
    def scrapeLike(self, url: str, delay: int = None) -> str:
        """Fetch a url, reusing cached scrapes of 'like' thread urls.

        Thread urls differ only in their slug portion, so a wildcard
        pattern lets the soft scraper match any previously fetched
        variant of the same thread. Non-thread urls are scraped plainly.
        Raises if the soft scraper yields nothing.
        """
        url = scrape.canonizeUrl(url)
        if delay is None:
            delay = self.defaultDelay
        threadPrefix = self.baseUrl + 'threads/'

        # non-thread urls get a plain soft scrape
        if not url.startswith(threadPrefix):
            res = scrape.softScrape(url,
                                    delay,
                                    mustyThreshold=self.mustyThreshold)
            if res is None:
                raise Exception('unable to soft scrape? FIXME')
            return res

        # split off the thread path; build a canonical url (slug reduced
        # to its trailing id) and a wildcard 'like' pattern matching any slug
        pieces = url[len(threadPrefix):].split('/')
        pieces[0] = pieces[0].split('.')[-1]
        canon = threadPrefix + '/'.join(pieces)
        pieces[0] = '%.' + pieces[0]
        ulike = threadPrefix + '/'.join(pieces)

        # FIXME canon may find an older url than ulike :/

        canonRes = scrape.getMostRecentScrapeWithMeta(canon)
        canonIsFresh = (canonRes is not None and
                        int(time.time()) - self.mustyThreshold
                        < canonRes['fetched'])
        if canonIsFresh:
            return cast(str, canonRes['raw'])

        res = scrape.softScrape(url,
                                delay,
                                ulike,
                                mustyThreshold=self.mustyThreshold)
        if res is None:
            raise Exception('unable to soft scrape? FIXME')
        return res
Example #2
0
	def create(self, fic: Fic) -> Fic:
		"""Construct the fic's url, scrape it, parse metadata, and persist."""
		fic.url = self.constructUrl(fic.localId)
		raw = scrape.softScrape(fic.url)
		if raw is None:
			raise Exception('unable to scrape? FIXME')

		fic = self.parseInfoInto(fic, raw)
		fic.upsert()

		return Fic.lookup((fic.id, ))
 def getChapterPublishDate(self, url: str) -> OilTimestamp:
     """Return the publish timestamp of the chapter page at url.

     Raises if the page cannot be scraped or if it does not contain
     exactly one <time class="entry-date published"> element.
     """
     from bs4 import BeautifulSoup
     url = self.canonizeUrl(url)
     data = scrape.softScrape(url)
     # softScrape may return None; fail loudly here instead of handing
     # None to BeautifulSoup (matches the None checks used elsewhere)
     if data is None:
         raise Exception('unable to scrape? FIXME')
     soup = BeautifulSoup(data, 'html5lib')
     publishTimes = soup.findAll('time',
                                 {'class': ['entry-date', 'published']})
     if len(publishTimes) != 1:
         raise Exception('cannot find publish time for {}'.format(url))
     uts = util.dtToUnix(
         dateutil.parser.parse(publishTimes[0].get('datetime')))
     return OilTimestamp(uts)
 def getChapterUrls(self, data: str = None) -> List[str]:
     """Return canonical chapter urls linked from the table of contents.

     data: optional pre-fetched toc html; when absent the toc page is
     soft-scraped from self.tocUrl. Urls are returned in order of first
     appearance with duplicates dropped. Raises if the toc page cannot
     be scraped.
     """
     from bs4 import BeautifulSoup  # type: ignore
     if data is None:
         data = scrape.softScrape(self.tocUrl)
         # softScrape may return None; fail loudly rather than pass
         # None to BeautifulSoup
         if data is None:
             raise Exception('unable to scrape? FIXME')
     soup = BeautifulSoup(data, 'html5lib')
     entryContents = soup.findAll('div', {'class': 'entry-content'})
     chapterUrls: List[str] = []
     seen = set()  # O(1) dedup instead of scanning the list per href
     for entryContent in entryContents:
         aTags = entryContent.findAll('a')
         for aTag in aTags:
             if aTag.get('href') is None:
                 continue
             href = self.canonizeUrl(aTag.get('href'))
             if href in seen:
                 continue
             seen.add(href)
             chapterUrls += [href]
     return chapterUrls
Example #5
0
    def create(self, fic: Fic) -> Fic:
        """Scrape the first chapter, populate fic metadata, and persist both."""
        fic.url = self.constructUrl(fic.localId, 1)

        # fetch fresh info
        raw = scrape.softScrape(fic.url)
        if raw is None:
            raise Exception('unable to scrape? FIXME')

        fic = self.parseInfoInto(fic, raw)
        fic.upsert()

        # chapter 1's content is the very page we just fetched
        ch = fic.chapter(1)
        ch.setHtml(raw)
        ch.localChapterId = str(1)
        ch.url = self.constructUrl(fic.localId, 1)
        ch.upsert()

        return Fic.lookup((fic.id, ))
    def softScrape(self, chapter: FicChapter) -> Optional[str]:
        """Soft scrape the chapter's url, following inline js redirects.

        Each redirect hop is persisted back onto the chapter before
        re-fetching; a self-redirect raises instead of looping forever.
        """
        import scrape
        import re
        while True:
            html = scrape.softScrape(chapter.url)
            if html is None:
                return html
            # TODO well this is a nightmare...
            if html.find('You are being redirected') < 0:
                return html

            match = re.search("window.location = ['\"]([^'\"]*)['\"];", html)
            if match is None or match.group(1) is None:
                return html

            if chapter.url == match.group(1):
                raise Exception('redirect loop')

            # record the redirect target and try again
            chapter.url = match.group(1)
            chapter.upsert()
Example #7
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse fanfics.me story metadata and chapters into fic.

		NOTE(review): currently disabled -- the site's markup changed and
		this parser was never updated, so it unconditionally raises below.
		Everything after the raise is the old (dead) implementation, kept
		for reference.
		"""
		raise Exception('FIXME TODO fanfics me format has changed')
		from bs4 import BeautifulSoup  # type: ignore
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		ficHead = soup.find('div', {'class': 'FicHead'})

		titleH1 = ficHead.find('h1')
		fic.title = titleH1.getText().strip()

		fandoms: List[str] = []
		trs = ficHead.findAll('div', {'class': 'tr'})
		author = None
		# each .tr row is a label/value pair of story metadata; the labels
		# are Russian strings compared verbatim below
		for tr in trs:
			divTitle = tr.find('div', {'class': 'title'})
			divContent = tr.find('div', {'class': 'content'})

			t = str(divTitle.getText()).strip()
			v = str(divContent.getText()).strip()

			if t == 'Автор:':  # author
				author = v
			elif t == 'Фандом:':  # fandom
				if v == 'Harry Potter' or v == 'Harry Potter - J. K. Rowling':
					fandoms += ['Harry Potter']
				else:
					raise Exception('unknown fandom: ' + v)
			elif t == 'Статус:':  # status
				if v == 'В процессе':  # "in progress"
					fic.ficStatus = FicStatus.ongoing
				elif v == 'Закончен':  # "finished"
					fic.ficStatus = FicStatus.complete
				else:
					raise Exception('unknown write status: ' + v)
			elif t == 'Опубликован:':  # published date
				fic.published = self.parseRussianDate(v)
			elif t == 'Изменен:':  # updated date
				fic.updated = self.parseRussianDate(v)
			elif t == 'Ссылка:':  # link
				src = v  # source archive url (currently unused)
			elif t == 'Читателей:':  # reader count
				fic.followCount = int(v)
			elif t == 'Персонажи:':  # characters
				# characters, parse relationship?
				pass
			elif t == 'Рейтинг:':  # rating
				fic.ageRating = v
			elif t == 'Предупреждения:':  # warnings
				# warnings?
				pass
			else:
				raise Exception('unknown metadata: ' + t)

		# TODO?
		assert (author is not None)
		# no separate author url/id on this site; reuse the name for both
		authorUrl = author
		authorId = author
		self.setAuthor(fic, author, authorUrl, authorId)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		if fic.url is None:
			fic.url = self.constructUrl(fic.localId)

		# description lives in one of two summary div variants
		summaryTextDiv = soup.find('div', {'class': 'summary_text'})
		if summaryTextDiv is None:
			summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
		fic.description = summaryTextDiv.getText()

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		if fic.followCount is None:
			fic.followCount = 0

		fic.ageRating = 'M'

		# chapter count comes from the table-of-contents list
		ficContentsUl = soup.find('ul', {'class': 'FicContents'})
		chapterLinks = ficContentsUl.findAll('li', {'class': 't-b-dotted'})
		fic.chapterCount = len(chapterLinks)

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		wordCount = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			chapter.localChapterId = str(cid)
			chapter.url = self.constructUrl(fic.localId, cid)

			# try to get it out of current blob first
			if chapter.html() is None:
				# chapter content divs are id'd c0, c1, ... (0-based)
				contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
				if contentDiv is not None:
					chapter.setHtml(
						'<div class="ReadContent">' + str(contentDiv) + '</div>'
					)

			# chapter title is the <h2> immediately preceding the content div
			if chapter.title is None or len(chapter.title) < 1:
				contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
				if contentDiv is not None:
					chapterTitle = contentDiv.previous_sibling
					if chapterTitle is not None and chapterTitle.name == 'h2':
						chapter.title = chapterTitle.getText()

			# fallback to scraping it directly
			if chapter.html() is None:
				cdata = scrape.softScrape(chapter.url)
				assert (cdata is not None)
				chapter.setHtml(self.extractContent(fic, cdata))
				csoup = BeautifulSoup(cdata, 'html5lib')
				contentDiv = csoup.find('div', {'id': 'c{}'.format(cid - 1)})
				chapterTitle = contentDiv.previous_sibling
				if chapterTitle is not None and chapterTitle.name == 'h2':
					chapter.title = chapterTitle.getText()

			if chapter.title is not None and len(chapter.title) > 0:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()
			wordCount += len(chapter.cachedContent().split())

		fic.wordCount = wordCount

		for fandom in fandoms:
			fic.add(Fandom.define(fandom))

		return fic
Example #8
0
	def softScrape(self, chapter: FicChapter) -> Optional[str]:
		"""Delegate to the scrape module's soft scraper for this chapter's url."""
		import scrape
		html = scrape.softScrape(chapter.url)
		return html
Example #9
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Populate fic from a scraped chapter page plus the search page.

        wwwHtml: html of one story chapter page (title link, author link,
        chapter dropdown). Published/updated dates, counts, status,
        description, and tags come from a follow-up search-page scrape.
        Mutates and upserts fic and its chapters; returns fic.
        """
        from bs4 import BeautifulSoup
        # localId has the form '{archive}/{storyNo}'
        archive = fic.localId.split('/')[0]
        storyNo = fic.localId.split('/')[1]

        soup = BeautifulSoup(wwwHtml, 'html5lib')

        titleH2 = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
        fic.title = str(titleH2.getText())

        # the author is the first anchor into the members profile area
        membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
        memberLink = soup.find(
            lambda t: (t.name == 'a' and t.has_attr("href") and t.get("href")
                       is not None and (t.get("href").startswith(membersUrl))))

        author = memberLink.getText()
        authorId = memberLink.get('href')[len(membersUrl):]
        authorUrl = memberLink.get('href')
        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO: placeholder; real status is set from search metadata below
        fic.ficStatus = FicStatus.ongoing

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: description is on search page
        if fic.description is None:
            fic.description = 'TODO: on the search page?'

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        fic.ageRating = 'M'

        # TODO: placeholders; real dates are set from search metadata below
        if fic.published is None:
            fic.published = OilTimestamp.now()
        if fic.updated is None:
            fic.updated = fic.published

        # the chapter dropdown has one link per chapter
        chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
        chapterLinks = chapterDropdown.findAll('a')
        oldChapterCount = fic.chapterCount
        fic.chapterCount = len(chapterLinks)

        if fic.wordCount is None:
            fic.wordCount = 0
        fic.upsert()

        # fetch every chapter, storing html and accumulating a word count
        wordCount = 0
        for cid in range(1, fic.chapterCount + 1):
            chapterContent = scrape.softScrape(
                self.constructUrl(fic.localId, cid))
            chapter = fic.chapter(cid)
            if chapterContent is not None:
                chapter.setHtml(chapterContent)
            chapter.localChapterId = str(cid)
            chapter.url = self.constructUrl(fic.localId, cid)

            # chapter title comes from the dropdown link text
            chapter.title = chapterLinks[cid - 1].getText().strip()
            if chapter.title is not None:
                chapter.title = util.cleanChapterTitle(chapter.title, cid)

            chapter.upsert()
            if chapterContent is not None:
                wordCount += len(chapterContent.split())

        fic.wordCount = wordCount

        # treat a growing chapter count as an update
        if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
            fic.updated = OilTimestamp.now()  # TODO
        fic.upsert()

        storyUrl = self.constructUrl(fic.localId, chapterId=None)

        # more metadata from search page: author + title search first
        searchUrl = ('http://{}.adult-fanfiction.org/search.php?' +
                     'auth={}&title={}&summary=&tags=&cats=0&search=Search')
        searchUrl = searchUrl.format(archive, author,
                                     fic.title.replace(' ', '+'))
        data = scrape.scrape(searchUrl)['raw']

        metas = self.extractSearchMetadata(data)

        # fallback to pure author search
        if storyUrl not in metas:
            searchUrl = ('http://{}.adult-fanfiction.org/search.php?' +
                         'auth={}&title=&summary=&tags=&cats=0&search=Search')
            searchUrl = searchUrl.format(archive, author)
            data = scrape.scrape(searchUrl)['raw']
            metas = self.extractSearchMetadata(data)

        if storyUrl not in metas:
            raise Exception('cannot find search metadata')

        meta = metas[storyUrl]

        assert (meta.published is not None and meta.updated is not None)
        fic.published = OilTimestamp(meta.published)
        fic.updated = OilTimestamp(meta.updated)

        fic.reviewCount = meta.reviewCount
        fic.favoriteCount = meta.views  # TODO

        fic.ficStatus = meta.ficStatus

        assert (meta.description is not None)
        fic.description = meta.description
        assert (fic.description is not None)
        if len(meta.tags) > 0:
            fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

        for fan in meta.fandoms:
            fic.add(Fandom.define(fan))

        return fic