Ejemplo n.º 1
def importFic(fdata):
	"""Create a Fic and its chapters from an imported data dict.

	A copy of `fdata` is passed through inflateObject together with the
	module-level ficImportRename, and every field of the result is assigned
	onto a fresh Fic through its __dict__. The published/updated values are
	then run through util.parseDateAsUnix (current unix time as the second
	argument), and the fandoms, characters, genres, tags and chapters found
	in `fdata` are walked while progress is printed.

	NOTE(review): this block does not parse as written. An opening call and
	several closing parentheses appear to be missing, and the body stops
	right after the chapter content is read. Restore the missing lines from
	the original source before relying on it.
	"""
	global ficImportRename
	ofic = inflateObject(fdata.copy(), ficImportRename)

	fic = Fic.new()
	# copy every inflated field straight onto the new Fic instance
	for field in ofic:
		print('setting "{}" to "{}"'.format(field, ofic[field]))
		fic.__dict__[field] = ofic[field]

	# re-parse the imported date values; current time is the second argument
	fic.published = util.parseDateAsUnix(fic.published, int(time.time()))
	fic.updated = util.parseDateAsUnix(fic.updated, int(time.time()))
	print('setting "{}" to "{}"'.format('published', fic.published))
	print('setting "{}" to "{}"'.format('updated', fic.updated))

	print('adding "{}" ({}/{})'.format(fic.title, fic.type, fic.localId))


	# NOTE(review): the fandom, genre and tag loops below only print in the
	# visible code; confirm whether lines attaching them to fic were lost
	for fandom in fdata['fandoms']:
		print('  adding fandom "{}"'.format(fandom))
	for character in fdata['characters']:
			# NOTE(review): the line opening this expression (presumably a
			# print call) and its closing parentheses are missing
			'  adding character "{}" from fandom "{}"'.format(
				character['name'], character['fandom']
			# NOTE(review): the extra indent suggests this call was wrapped in
			# another call whose opening line is missing -- confirm
			Character.define(Fandom.define(character['fandom']), character['name'])
	for genre in fdata['genres']:
		print('  adding genre "{}"'.format(genre))
	for tag in fdata['tags']:
		print('  adding tag "{}"'.format(tag))

	# chapter keys are strings in fdata['chapters']; they are used as ints here
	cids = [int(cid) for cid in fdata['chapters']]
	for cid in cids:
		print('  adding chapter {}'.format(cid))
		ochap = fdata['chapters'][str(cid)]
		chapter = FicChapter.new()
		chapter.fic = fic
		chapter.ficId = fic.id
		chapter.chapterId = cid
		for field in ochap:
			chapter.__dict__[field] = ochap[field]
		# NOTE(review): the closing parenthesis of this format call is missing
		contentPath = './content/{}/{}/{}/content.html'.format(
			fic.type, fic.localId, cid
		if os.path.isfile(contentPath):
			html = None
			with open(contentPath, 'r') as f:
				html = f.read()
			# NOTE(review): html is only measured here; whatever stored it on
			# the chapter is not visible in this excerpt
			print('    has content: {}'.format(len(html)))
Ejemplo n.º 2
	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		"""Populate `fic` with metadata parsed from an AO3 work page.

		Parses `html` with BeautifulSoup and fills in the title, description,
		chapter/word/kudos counts, published and updated timestamps, status,
		author, per-chapter url and title, fandoms and -- when exactly one
		fandom was resolved -- characters taken from the relationship tags.
		Returns the same `fic` object.

		Raises Exception when the title heading is not found exactly once,
		when the chapter dropdown has fewer entries than chapterCount, or when
		a relationship tag splits into more than 8 names.

		NOTE(review): this block does not parse as written. Many lines appear
		to have been lost (closing `):` of multi-line conditions, `else:`
		branches, and the body of a number of fandom branches). Restore them
		from the original source; the notes below mark the visible gaps.
		"""
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		titleHeadings = soup.findAll('h2', {'class': 'title heading'})
		if len(titleHeadings) != 1:
			raise Exception('unable to find ao3 title {}'.format(fic.url))
		fic.title = titleHeadings[0].get_text().strip()

		summaryModules = soup.findAll('div', {'class': 'summary module'})
		if len(summaryModules) != 1:
			# not exactly one summary on the page: retry inside the single
			# 'preface group' div, if there is one
			prefaceGroups = soup.findAll('div', {'class': 'preface group'})
			if len(prefaceGroups) == 1:
				# NOTE(review): the closing parenthesis of this call is missing
				summaryModules = prefaceGroups[0].findAll(
					'div', {'class': 'summary module'}

		if len(summaryModules) == 1:
			summaryBq = summaryModules[0].find('blockquote')
			fic.description = summaryBq.decode_contents(formatter='html').strip()
		elif fic.description is None:
			# keep any existing description; only fill a placeholder if unset
			fic.description = "{no summary}"
			# raise Exception('unable to find ao3 summary {}'.format(fic.localId))

		fic.ageRating = '<unkown>'

		# TODO: error handling
		# the chapters text is split on '/' into completed/total, where the
		# total may be '?'
		cText = ' '.join(soup.find('dd', {'class': 'chapters'}).contents).strip()
		ps = cText.split('/')
		completedChapters = int(ps[0])
		totalChapters = None if ps[1] == '?' else int(ps[1])
		fic.chapterCount = completedChapters

		wText = ' '.join(soup.find('dd', {'class': 'words'}).contents).strip()
		fic.wordCount = int(wText)

		fic.reviewCount = 0

		# kudos are stored as favoriteCount; the dd may be absent
		fic.favoriteCount = 0
		kDefinition = soup.find('dd', {'class': 'kudos'})
		if kDefinition is not None:
			kText = ' '.join(kDefinition.contents).strip()
			fic.favoriteCount = int(kText)

		fic.followCount = 0

		pText = ' '.join(soup.find('dd', {'class': 'published'}).contents).strip()
		publishedUts = util.parseDateAsUnix(pText, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

		# NOTE(review): as written, updated is defaulted to published and then
		# always re-parsed by the second `if`; confirm whether the second test
		# was meant to be an elif
		if fic.updated is None:
			fic.updated = fic.published
		if fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		fic.ficStatus = FicStatus.ongoing  # TODO chapter/chapters?

		if totalChapters is None or completedChapters < totalChapters:
			fic.ficStatus = FicStatus.ongoing

		# an explicit status dt overrides the status and the updated timestamp
		statusDt = soup.find('dt', {'class': 'status'})
		if statusDt is not None:
			if statusDt.contents[0] == 'Completed:':
				fic.ficStatus = FicStatus.complete
				cText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
				updatedUts = util.parseDateAsUnix(cText, fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			elif statusDt.contents[0] == 'Updated:':
				fic.ficStatus = FicStatus.ongoing
				uText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
				updatedUts = util.parseDateAsUnix(uText, fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
				# NOTE(review): an `else:` line appears to be missing before
				# this raise; as written it fires on every 'Updated:' status
				raise Exception('unkown status: {}'.format(statusDt.contents[0]))

		byline = soup.find('h3', {'class': 'byline heading'})
		authorLink = byline.find('a')
		# NOTE(review): the else branches of both ifs below appear to be
		# missing; as written, authorLink.get runs even when authorLink is None
		if authorLink is None:
			if fic.authorId is not None and len(fic.getAuthorName()) > 0:
				pass  # updated author to anon, don't make changes
				# first loaded after it was already set to anonymous
				authorUrl = ''
				author = 'Anonymous'
				authorId = 'Anonymous'
				self.setAuthor(fic, author, authorUrl, authorId)
			authorUrl = authorLink.get('href')
			author = ' '.join(byline.find('a').contents)
			authorId = author  # map pseudo to real?
			self.setAuthor(fic, author, authorUrl, authorId)

		if fic.chapterCount > 1:
			localChapterIdSelect = soup.find(id='selected_id').findAll('option')
			# note: ao3 sometimes says there are less chapters than there really
			# are, possibly due to caching on their end. We just ensure there's _at
			# least_ chapterCount chapters, then fetch whatever the dropdown tells
			# us to
			if len(localChapterIdSelect) > fic.chapterCount:
				fic.chapterCount = len(localChapterIdSelect)
			if len(localChapterIdSelect) != fic.chapterCount:
				raise Exception('mismatching localChapterId count?')

			# chapter ids are 1-based; dropdown options are 0-based
			for cid in range(1, fic.chapterCount + 1):
				chap = fic.chapter(cid)
				# NOTE(review): the closing parenthesis of this format call is
				# missing
				chap.url = '{}{}/chapters/{}?view_adult=true'.format(
					self.baseUrl, fic.localId, localChapterIdSelect[cid - 1].get('value')
				chap.localChapterId = localChapterIdSelect[cid - 1].get('value')
				chap.title = localChapterIdSelect[cid - 1].getText().strip()
				if chap.title is not None:
					chap.title = util.cleanChapterTitle(chap.title, cid)

		# Map each AO3 fandom tag (lower-cased) onto a Fandom name via a long
		# if/elif chain.
		# NOTE(review): throughout this chain the closing `):` of multi-line
		# conditions and the bodies of many branches are missing, so several
		# `elif` lines are followed directly by another `elif`.
		fandomDd = soup.find('dd', {'class': 'fandom tags'})
		if fandomDd is not None:
			fandomTags = fandomDd.findAll('a', {'class': 'tag'})
			for ft in fandomTags:
				originalF = ft.contents[0].strip()
				f = originalF.lower()
				# TODO: this seriously needs reworked
				if (
					(f.startswith("harry potter ") and f.endswith("rowling"))
					or f == 'harry potter - fandom'
					or f == 'fantastic beasts and where to find them (movies)'
					or f == 'harry potter next generation - fandom'
					fic.add(Fandom.define('Harry Potter'))
				elif (
					f == 'sherlock - fandom' or f == 'sherlock (tv)'
					or f == 'sherlock holmes & related fandoms'
					or f == 'sherlock holmes - arthur conan doyle'
					or f == 'sherlock holmes (downey films)'
					fic.add(Fandom.define('Sherlock Holmes'))
				elif f == 'furry (fandom)' or f == 'harry - fandom':
					continue  # skip
				elif f == 'fleurmione - fandom':
					continue  # skip
				elif f == 'skyfall (2012) - fandom':
					fic.add(Fandom.define('James Bond'))
				elif f == 'orphan black (tv)':
					fic.add(Fandom.define('Orphan Black'))
				elif (
					f == 'naruto' or f == 'naruto shippuden'
					or f == 'naruto shippuuden - fandom'
				elif f == 'naruto/harry potter':
					fic.add(Fandom.define('Harry Potter'))
				elif f == 'bleach':
				elif (
					f == 'iron man (movies)' or f == 'iron man - all media types'
					or f == 'iron man (comic)' or f == 'iron man - fandom'
					or f == 'iron man (comics)'
					fic.add(Fandom.define('Iron Man'))
				elif (
					f == 'the avengers (marvel) - all media types'
					or f == 'the avengers (marvel movies)'
					or f == 'the avengers - ambiguous fandom'
					or f == 'the avengers (2012)' or f == 'the avengers'
					or f == 'avengers (marvel) - all media types'
					or f == 'marvel avengers movies universe' or f == 'avengers'
				elif f == 'marvel 616':
					fic.add(Fandom.define('Marvel 616'))
				elif f == 'thor (movies)' or f == 'thor - all media types':
				elif (
					f == 'captain america (movies)'
					or f == 'captain america - all media types'
					or f == 'captain america (comics)'
					fic.add(Fandom.define('Captain America'))
				elif (
					f == 'avatar: the last airbender' or f == 'avatar: legend of korra'
					or f == 'avatar the last airbender - fandom'
				elif f == 'original work':
					fic.add(Fandom.define('Original Work'))
				elif f == 'stargate atlantis':
					fic.add(Fandom.define('Stargate Atlantis'))
				elif f == 'stargate sg-1':
					fic.add(Fandom.define('Stargate SG-1'))
				elif f == 'stargate - all series':
					fic.add(Fandom.define('Stargate Atlantis'))
					fic.add(Fandom.define('Stargate SG-1'))
				elif f == 'agents of s.h.i.e.l.d. (tv)':
				elif f == 'supernatural':
				elif f == 'teen wolf (tv)':
					fic.add(Fandom.define('Teen Wolf'))
				elif f == 'grimm (tv)':
				elif (
					f == 'the amazing spider-man (movies - webb)'
					or f == 'spider-man - all media types'
					or f == 'spider-man: homecoming (2017)'
				elif (
					f == 'x-men - all media types' or f == 'x-men (movieverse)'
					or f == 'x-men (comicverse)'
				elif (
					f == 'lord of the rings - j. r. r. tolkien'
					or f == 'the lord of the rings - j. r. r. tolkien'
					fic.add(Fandom.define('Lord of the Rings'))
				elif (
					f == 'crisis core: final fantasy vii'
					or f == 'compilation of final fantasy vii' or f == 'final fantasy vii'
					fic.add(Fandom.define('Final Fantasy VII'))
					fic.add(Fandom.define('Final Fantasy'))
				elif f == 'sen to chihiro no kamikakushi | spirited away':
					fic.add(Fandom.define('Spirited Away'))
				elif f == 'howl no ugoku shiro | howl\'s moving castle':
					fic.add(Fandom.define('Howl\'s Moving Castle'))
				elif f == 'rise of the guardians (2012)':
					fic.add(Fandom.define('Rise of the Guardians'))
				elif (
					f == 'doctor who' or f == 'doctor who (2005)'
					or f == 'doctor who & related fandoms'
					fic.add(Fandom.define('Doctor Who'))
				elif f == 'daredevil (tv)' or f == 'daredevil (comics)':
				elif f == 'labyrinth (1986)':
				elif f == 'gravity falls':
					fic.add(Fandom.define('Gravity Falls'))
				elif f == 'once upon a time (tv)':
					fic.add(Fandom.define('Once Upon a Time'))
				elif f == 'doctor strange (comics)':
					fic.add(Fandom.define('Doctor Strange'))
				elif f == 'the sentinel':
					fic.add(Fandom.define('The Sentinel'))
				elif f == 'teen titans (animated series)':
					fic.add(Fandom.define('Teen Titans'))
				elif (
					f == 'dcu' or f == 'dcu animated' or f == 'dcu (comics)'
					or f == 'dc extended universe' or f == 'dc animated universe'
				elif f == 'vampire hunter d':
					fic.add(Fandom.define('Vampire Hunter D'))
				elif f == 'homestuck':
				elif f == 'one piece':
					fic.add(Fandom.define('One Piece'))
				elif f == 'batman (movies - nolan)':
				elif f == 'die hard (movies)':
					fic.add(Fandom.define('Die Hard'))
				elif f == 'discworld - terry pratchett':
				elif f == 'gossip girl':
					fic.add(Fandom.define('Gossip Girl'))
				elif (
					f == 'a song of ice and fire - george r. r. martin'
					or f == 'a song of ice and fire & related fandoms'
					fic.add(Fandom.define('A Song of Ice and Fire'))
				elif f == 'supergirl (tv 2015)':
				elif f == 'merlin (tv)':
				elif f == 'star trek':
					fic.add(Fandom.define('Star Trek'))
				elif f == 'steven universe (cartoon)':
					fic.add(Fandom.define('Steven Universe'))
				elif f == 'hellsing':
				elif f == 'the breaker':
					fic.add(Fandom.define('The Breaker'))
				elif f == 'smallville':
				elif f == '베리타스 | veritas (manhwa)':
					fic.add(Fandom.define('Veritas (manhwa)'))
				elif f == 'guardians of childhood - william joyce':
					fic.add(Fandom.define('Guardians of Childhood'))
				elif f == 'person of interest (tv)':
					fic.add(Fandom.define('Person of Interest'))
				elif f == 'james bond (craig movies)':
					fic.add(Fandom.define('James Bond'))
				elif f == 'the bourne legacy (2012)':
					fic.add(Fandom.define('Jason Bourne'))
				elif f == 'numb3rs':
				elif f == 'temeraire - naomi novik':
				elif f == 'twilight series - stephenie meyer':
				elif f == 'dungeons and dragons - fandom':
					fic.add(Fandom.define('Dungeons and Dragons'))
				elif f == 'american horror story' or f == 'american horror story: cult':
					fic.add(Fandom.define('American Horror Story'))
				elif (
					f == 'worm (web serial novel)' or f == 'worm - wildbow'
					or f == 'parahumans series - wildbow'
					or f == 'worm (web serial) | wildbow' or f == 'worm - fandom'
					or f == 'parahumans - fandom' or f == 'worm (parahumans)'
					or f == 'worm (web serial)' or f == 'worm | parahumans'
					or f == 'worm (web novel)'
				elif f == 'toaru kagaku no railgun | a certain scientific railgun':
					fic.add(Fandom.define('A Certain Scientific Railgun'))
				elif f == 'toaru majutsu no index | a certain magical index':
					fic.add(Fandom.define('A Certain Magical Index'))
				elif f == 'cthulhu mythos - h. p. lovecraft':
				elif f == 'transformers - all media types':
				elif f == 'destiny (video game)':
				elif f == 'fandom - fandom' or f == 'meta - fandom':
					pass  # >_>
				elif f == 'house m.d.':
					fic.add(Fandom.define('House, M.D.'))
				elif f == 'the hobbit (jackson movies)':
					fic.add(Fandom.define('The Hobbit'))
				elif f == 'doctor strange (2016)':
					fic.add(Fandom.define('Doctor Strange'))
				elif f == 'arrow (tv 2012)':
				elif f == 'the flash (tv 2014)':
				elif f == 'senki zesshou symphogear':
				elif (
					f == 'fullmetal alchemist: brotherhood & manga'
					or f == 'fullmetal alchemist - all media types'
					or f == 'fullmetal alchemist (anime 2003)'
					fic.add(Fandom.define('Fullmetal Alchemist'))
				elif (
					f == 'star wars - all media types'
					or f == 'star wars episode vii: the force awakens (2015)'
					or f == 'star wars prequel trilogy'
					fic.add(Fandom.define('Star Wars'))
				elif (
					f == 'guardians of the galaxy (2014)'
					or f == 'guardians of the galaxy - all media types'
					or f == 'guardians of the galaxy (movies)'
					fic.add(Fandom.define('Guardians of the Galaxy'))
				elif f == 'ant man (2015)' or f == 'ant-man (movies)':
					fic.add(Fandom.define('Ant Man'))
				elif f == 'the defenders (marvel tv)':
					fic.add(Fandom.define('The Defenders'))
				elif f == 'elementary (tv)':
				elif f == 'good omens - neil gaiman & terry pratchett':
					fic.add(Fandom.define('Good Omens'))
				elif f == 'danny phantom':
					fic.add(Fandom.define('Danny Phantom'))
				elif f == 'katekyou hitman reborn!':
					fic.add(Fandom.define('Katekyo Hitman Reborn!'))
				elif f == 'welcome to night vale':
					fic.add(Fandom.define('Welcome to Night Vale'))
				elif f == 'ncis':
				elif f == 'torchwood':
				elif f == 'magic: the gathering':
					fic.add(Fandom.define('Magic: The Gathering'))
				elif f == 'overwatch (video game)':
				elif f == 'detroit: become human (video game)':
					fic.add(Fandom.define('Detroit: Become Human'))
				elif f == 'greek and roman mythology':
				elif f == 'life is strange (video game)':
					fic.add(Fandom.define('life is strange (video game)'))
				elif f == 'akatsuki no yona | yona of the dawn':
					fic.add(Fandom.define('Yona of the Dawn'))
				elif f == '僕のヒーローアカデミア | boku no hero academia | my hero academia':
					fic.add(Fandom.define('My Hero Academia'))
				elif f == 'voltron: legendary defender':
				elif f == 'selfie (tv)':
				elif f == 'suits (tv)':
				elif f == 'fruits basket':
					fic.add(Fandom.define('Fruits Basket'))
				elif f == 'hetalia: axis powers':
					fic.add(Fandom.define('Hetalia: Axis Powers'))
				elif f == 'carmilla (web series)':
				elif f == 'the dresden files - jim butcher':
					fic.add(Fandom.define('Dresden Files'))
				elif f == 'girl genius':
					fic.add(Fandom.define('Girl Genius'))
				elif f == 'unspecified fandom':
					pass  # TODO?
				elif f == 'nightwing (comics)':
				elif f == 'books of the raksura - martha wells':
					fic.add(Fandom.define('Books of the Raksura'))
				elif f == 'fall of ile-rien - martha wells':
					fic.add(Fandom.define('Fall of Ile-Rien'))
				elif f == 'vorkosigan saga - lois mcmaster bujold':
					fic.add(Fandom.define('Vorkosigan Saga'))
				elif (
					f == 'highlander: the series' or f == 'highlander - all media types'
				elif f == 'yoroiden samurai troopers | ronin warriors':
					fic.add(Fandom.define('Ronin Warriors'))
				elif f == 'hockey rpf':
					fic.add(Fandom.define('Hockey RPF'))
				elif f == 'pacific rim (2013)':
					fic.add(Fandom.define('Pacific Rim'))
				elif f == 'enchanted forest chronicles - patricia wrede':
					fic.add(Fandom.define('Enchanted Forest Chronicles'))
				elif f == 'tortall - tamora pierce':
				elif f == 'protector of the small - tamora pierce':
					fic.add(Fandom.define('Protector of the Small'))
				elif f == 'leverage':
				elif f == 'valdemar series - mercedes lackey':
					fic.add(Fandom.define('Valdemar Series'))
				elif (
					f == 'b.p.r.d.' or f == 'bureau for paranormal research and defense'
				elif f == 'hellboy (comic)':
				elif f == 'sga/avatar':
					fic.add(Fandom.define('Stargate Atlantis'))
				elif f == 'annihilation (2018 garland)':
				elif f == 'craft sequence - max gladstone':
					fic.add(Fandom.define('Craft Sequence'))
				elif f == 'the good place (tv)':
					fic.add(Fandom.define('The Good Place'))
				elif f == 'jessica jones (tv)':
					fic.add(Fandom.define('Jessica Jones'))
				elif f == 'mad max series (movies)':
					fic.add(Fandom.define('Mad Max'))
				elif f == 'american gods (tv)':
					fic.add(Fandom.define('American Gods'))
				elif f == 'terminator: the sarah connor chronicles':
					fic.add(Fandom.define('Terminator: The Sarah Connor Chronicles'))
				elif f == 'wolf 359 (radio)':
					fic.add(Fandom.define('Wolf 359'))
				elif f == 'shadowrun: dragonfall':
				elif f == 'ars paradoxica (podcast)':
					fic.add(Fandom.define('Ars Paradoxica'))
				elif f == 'love is strange - fandom':
					fic.add(Fandom.define('Love is Strange'))
				elif f == 'dune - all media types':
				elif f == 'dragon age: origins':
					fic.add(Fandom.define('Dragon Age: Origins'))
				elif f == 'game of thrones (tv)':
					fic.add(Fandom.define('Game of Thrones'))
				elif f == 'chronicles of amber - roger zelazny':
					fic.add(Fandom.define('Chronicles of Amber'))
				elif f == 'the southern reach trilogy - jeff vandermeer':
					fic.add(Fandom.define('The Southern Reach Trilogy'))
				elif f == 'continuum (tv)':
				elif f == 'mage: the ascension':
					fic.add(Fandom.define('Mage: The Ascension'))
				elif f == 'the good wife (tv)' or f == 'good wife (tv)':
					fic.add(Fandom.define('The Good Wife'))
				elif f == 'alliance-union - c. j. cherryh':
				elif f == 'indexing - seanan mcguire':
				elif f == 'ultraviolet (tv)':
				elif f == 'veronica mars (tv)':
					fic.add(Fandom.define('Veronica Mars'))
				elif f == 'secret circle (tv)':
					fic.add(Fandom.define('Secret Circle'))
				elif f == 'mahou shoujo madoka magika | puella magi madoka magica':
					fic.add(Fandom.define('Madoka Magica'))
				elif f == 'agent carter (tv)':
					fic.add(Fandom.define('Agent Carter'))
				elif f == 'dracula & related fandoms':
				elif f == 'dragon ball':
					fic.add(Fandom.define('Dragon Ball'))
				elif f == 'mass effect - all media types':
					fic.add(Fandom.define('Mass Effect'))
				elif f == 'firefly' or f == 'serenity (2005)':
					# NOTE(review): the lines below look like a fallback lookup
					# in the module-level ao3FandomsMap for tags not matched
					# above, presumably under a missing `else:`; the bodies of
					# `if not here:` and `for mf in fm[1]:` are also missing --
					# confirm against the original
					anyHere = False
					global ao3FandomsMap
					for fm in ao3FandomsMap:
						here = False
						# fm[0] holds the tag spellings compared against f
						for uf in fm[0]:
							if f == uf.lower().strip():
								here = True
						if not here:
						anyHere = True
						for mf in fm[1]:
					# unknown fandoms are logged rather than raised
					if not anyHere:
						util.logMessage(f'ao3|unknown fandom|{fic.url}|{originalF}')
						#raise Exception('unknown fandom: {} "{}"'.format(fic.url, originalF))

		ourDoms = fic.fandoms()
		# we have a canonical fandom, try to find our characters
		if len(ourDoms) == 1:
			relationshipDd = soup.find('dd', {'class': 'relationship tags'})
			if relationshipDd is not None:
				relationshipTags = relationshipDd.findAll('a', {'class': 'tag'})
				for rt in relationshipTags:
					r = rt.contents[0]
					# a relationship tag is split on '/' into character names
					chars = r.split('/')
					if len(chars) > 8:  # TODO: sometimes more?
						raise Exception('unable to parse relationship: {}'.format(r))
					for char in chars:
						fic.add(Character.defineInFandom(ourDoms[0], char, self.ftype))

		return fic
Ejemplo n.º 3
	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		"""Populate `fic` from a story page plus the author's profile page.

		The story page (`html`) supplies the author link, the title and the
		chapter dropdown. The author's profile is then fetched through
		scrape.scrape to find this story's table, which supplies the
		description, review count and last-updated date. The word count is
		the total number of whitespace-separated tokens in the html of each
		chapter that has any. The 'Harry Potter' fandom is always added.
		Returns the same `fic` object.

		Raises Exception when the info pane, title, story table, review
		count or last-updated date cannot be located.

		NOTE(review): this block does not parse as written. Several lines
		appear to be missing (loop `continue`/`break`/`else` lines and the
		closing parentheses of multi-line calls). Restore them from the
		original source; the notes below mark the visible gaps.
		"""
		from bs4 import BeautifulSoup
		html = html.replace('\r\n', '\n')
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		infoPane = soup.findAll('td', {'class': 'info2_pane'})
		if len(infoPane) != 1:
			raise Exception('unable to find info2_pane: {}'.format(fic.url))
		infoPane = infoPane[0]

		# the author link is the anchor whose href starts with this prefix;
		# the remainder of the href is used as the author's local id
		authorHrefPrefix = 'index.php?action=profile&id='
		authorLinks = infoPane.findAll('a')
		authorUrl = None
		for authorLink in authorLinks:
			# NOTE(review): the body of this if (presumably `continue`) is
			# missing
			if not authorLink.get('href').startswith(authorHrefPrefix):

			authorUrl = self.baseUrl + '/' + authorLink.get('href')
			author = authorLink.getText()
			authorLocalId = authorLink.get('href')[len(authorHrefPrefix):]

			self.setAuthor(fic, author, authorUrl, authorLocalId)
			# NOTE(review): as written this raise follows setAuthor directly;
			# a `break` and a loop-level `else:` were presumably lost -- confirm
			raise Exception('unable to find author: {}'.format(fic.url))

		# NOTE(review): the closing parenthesis of this re.search call is
		# missing
		titleMatch = re.search(
			'<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane), re.MULTILINE
		if titleMatch is None:
			edumpContent(str(infoPane), 'sugarquill_title')
			raise Exception('could not locate title')

		fic.title = titleMatch.group(1).replace('&nbsp;', ' ').strip()

		# map each option's integer value to its stripped text
		chapterOptions = infoPane.findAll('option')
		chapterTitles = {}
		for chapterOption in chapterOptions:
			cid = int(chapterOption.get('value'))
			chapterTitles[cid] = chapterOption.getText().strip()
		fic.chapterCount = len(chapterOptions)

		fic.ageRating = '<unkown>'  # TODO
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

		# remaining metadata comes from the author's profile page
		authorProfileHtml = scrape.scrape(authorUrl)['raw']
		authorProfileHtml = authorProfileHtml.replace('\r', '')
		authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

		# find the table whose read.php link carries this fic's story id
		storyTables = authorSoup.findAll('table', {'width': '90%'})
		ourStoryTable = None
		for storyTable in storyTables:
			storyId = None
			for a in storyTable.findAll('a'):
				# NOTE(review): the body of this if (presumably `continue`) is
				# missing
				if not a.get('href').startswith('read.php?storyid='):
				storyId = a.get('href')[len('read.php?storyid='):]
				storyId = storyId[:storyId.find('&')]
				storyId = str(int(storyId))
			# NOTE(review): the bodies of the next two ifs (presumably
			# `continue`) are missing
			if storyId is None:
			if storyId != str(fic.localId):
			ourStoryTable = storyTable
		if ourStoryTable is None:
			raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

		# row 0 is searched for the review count, row 1 holds the description
		# and row 2 is searched for the last-updated date
		trs = ourStoryTable.findAll('tr')
		if len(trs) != 3:
			# NOTE(review): the closing parenthesis of this raise is missing
			raise Exception(
				f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'

		fic.description = trs[1].find('td').getText().strip()

		# NOTE(review): the closing parenthesis of this call is missing; the
		# pattern is also a non-raw string using `\(`, which modern Python
		# flags as an invalid escape sequence
		reviewsMatch = re.search(
			'\( Reviews: <a[^>]*>(\\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
		if reviewsMatch is None:
			edumpContent(str(trs[0]), 'sugarquill_reviews')
			raise Exception('could not locate reviews')

		fic.reviewCount = int(reviewsMatch.group(1).strip())

		updatedMatch = re.search('Last updated (\\d+/\\d+/\\d+)', str(trs[2]))
		if updatedMatch is None:
			edumpContent(str(trs[2]), 'sugarquill_updated')
			raise Exception('could not locate last updated')

		# NOTE(review): the closing parenthesis of the OilTimestamp call is
		# missing
		fic.updated = OilTimestamp(
			util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
		# no separate published date is parsed here; fall back to updated
		if fic.published is None:
			fic.published = fic.updated

		fic.wordCount = 0

		# chapter ids are 1-based; the word count is the total number of
		# whitespace-separated tokens across the chapters that have html
		for cid in range(fic.chapterCount):
			ch = fic.chapter(cid + 1)
			ch.localChapterId = str(cid + 1)
			ch.title = chapterTitles[cid + 1]
			chtml = ch.html()
			if chtml is not None:
				fic.wordCount += len(chtml.split())

		fic.add(Fandom.define('Harry Potter'))
		# TODO: chars/relationship?

		return fic
Ejemplo n.º 4
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Populate `fic` from a story index page given as `wwwHtml`.

		fic.localId is split on '/' into an author part and a story part.
		The page header supplies the title and author, the 'well' div
		supplies the description, status and overall counts, and each <p>
		child of the well's parent is treated as one chapter entry supplying
		per-chapter word count, review count and upload date. published and
		updated become the earliest and latest chapter dates. Returns the
		same `fic` object.

		NOTE(review): this block does not parse as written. The argument of
		a `str(` call, the opening lines of two multi-line calls, an `else:`
		branch and an `if` body appear to be missing. Restore them from the
		original source; the notes below mark the visible gaps.
		"""
		from bs4 import BeautifulSoup
		authorLid = fic.localId.split('/')[0]
		storyLid = fic.localId.split('/')[1]

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'

		soup = BeautifulSoup(wwwHtml, 'html5lib')

		pageHeader = soup.find('div', {'class': 'page-header'})
		titleH2 = pageHeader.find('h2')
		fic.title = titleH2.getText().strip()

		# the author's display name comes from the first link in the header;
		# the author id is the first part of fic.localId
		authorLink = pageHeader.find('a')
		author = authorLink.getText().strip()
		authorId = authorLid
		authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
		self.setAuthor(fic, author, authorUrl, authorId)

		divWell = soup.find('div', {'class': 'well'})

		summaryQuote = divWell.find('blockquote')

		# NOTE(review): the argument to str( is missing (presumably derived
		# from summaryQuote, which is otherwise unused)
		fic.description = str(
		).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
		# collapse runs of spaces down to a single space
		while fic.description.find('  ') != -1:
			fic.description = fic.description.replace('  ', ' ')
		fic.description = fic.description.strip()

		divWellText = divWell.getText().strip()

		# NOTE(review): the regex literals in this method are non-raw strings
		# with `\s`/`\d` escapes, which modern Python flags as invalid escape
		# sequences
		match = re.search('Status:\s*([^-]*) -', divWellText)
		if match is not None and match.group(1) == 'In progress':
			fic.ficStatus = FicStatus.ongoing
			# NOTE(review): as written this raise follows the assignment
			# directly; other status branches and an `else:` were presumably
			# lost -- confirm
			raise Exception('unable to find fic status')

			# NOTE(review): the line opening this call is missing, as is its
			# closing; it looks like a field-name -> (pattern, type) mapping
			# similar to the RegexMatcher call used elsewhere -- confirm
			divWellText, {
				'ageRating': ('Rating\s*:\s+([^-]+) -', str),
				'chapterCount': ('Chapters\s*:\s+(\d+) -', int),
				'wordCount': ('Word count\s*:\s+([\d,]+) -', str),
		assert (fic.chapterCount is not None)

		# wordCount is matched as a string; strip thousands separators
		if str(fic.wordCount).find(',') != -1:
			fic.wordCount = int(str(fic.wordCount).replace(',', ''))

		wellParent = divWell.parent
		cid = 0
		wordCount = 0
		reviewCount = 0
		chapterDates: List[int] = []

		# each <p> child is expected to be the next chapter, in order
		for child in wellParent.children:
			if child.name != 'p': continue
			cid += 1
			# NOTE(review): the body of this if is missing
			if str(child).find('Chapter {}'.format(cid)) == -1:
			chapterLink = child.find('a')
			expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
			if chapterLink.get('href').lower() != expectedUrl:
				raise Exception('unexpected chapter url: ' + chapterLink.get('href'))

			chInfo = ChapterInfo()

				# NOTE(review): the line opening this call is missing, as is
				# its closing; presumably it fills chInfo from the chapter text
				child.getText(), {
					'wordCount': ('Word count\s*:\s+([\d,]+) -', str),
					'reviewCount': ('Reviews\s*:\s+([^-]+) -', int),
					'updated': ('Uploaded on\s*:\s+(.+)', str),
			assert (chInfo.updated is not None)

			if str(chInfo.wordCount).find(',') != -1:
				chInfo.wordCount = int(str(chInfo.wordCount).replace(',', ''))

			wordCount += chInfo.wordCount
			reviewCount += chInfo.reviewCount

			dt = (util.parseDateAsUnix(chInfo.updated, int(time.time())))
			chapterDates += [dt]

		# wordCount is already set from overall metadata
		fic.reviewCount = reviewCount

		# NOTE(review): min()/max() raise ValueError if no chapter entries
		# were collected
		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))

		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = 'Chapter_{}'.format(cid)
			ch.url = self.constructUrl(fic.localId, cid)

		return fic
Ejemplo n.º 5
	def parseRussianDate(self, datestr: str) -> OilTimestamp:
		"""Convert a dot-separated date string into an OilTimestamp.

		The first two dot-separated fields of `datestr` are swapped (so
		"31.12.2020" becomes "12.31.2020") before the string is handed to
		util.parseDateAsUnix, with the current unix time as second argument.
		Raises IndexError when `datestr` has fewer than three fields.
		"""
		pieces = datestr.split('.')
		# index in the same order as the original so a short input fails the
		# same way
		second = pieces[1]
		first = pieces[0]
		third = pieces[2]
		swapped = '.'.join((second, first, third))
		now = int(time.time())
		return OilTimestamp(util.parseDateAsUnix(swapped, now))
Ejemplo n.º 6
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Populate `fic` from a story page given as `wwwHtml`.

		If the page has no 'profile_top' element and shows one of the known
		"Story Not Found" warnings, the fic is marked abandoned (unless it is
		already complete) and returned early. Otherwise the title,
		description, counts, dates, status, author, fandoms and chapter
		urls/titles are read from the page, and the meta span is re-parsed
		to rebuild fic.extraMeta and the published/updated timestamps.
		Returns a Fic -- note that `fic` is re-selected by sourceId/localId
		part-way through, so the returned object is that selected row.

		NOTE(review): this block does not parse as written. The closing of
		the deletedFicTexts list, several `):` / `)` lines, `break`/`else:`
		lines, a `try:` line, the opener of a logging call and the body of
		one loop appear to be missing. Restore them from the original
		source; the notes below mark the visible gaps.
		"""
		from bs4 import BeautifulSoup  # type: ignore
		# NOTE(review): the closing bracket of this list is missing
		deletedFicTexts = [
			# probably deleted by user
			'Story Not FoundUnable to locate story. Code 1.',
			# probably deleted by admin
			'Story Not FoundUnable to locate story. Code 2.',
			# unknown
			'Story Not FoundStory is unavailable for reading. (A)',
		soup = BeautifulSoup(wwwHtml, 'html5lib')
		profile_top = soup.find(id='profile_top')
		# story might've been deleted
		if profile_top is None:
			gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
			for gui_warning in gui_warnings:
				for deletedFicText in deletedFicTexts:
					if gui_warning.get_text() == deletedFicText:
						# a complete fic keeps its status; anything else is
						# marked abandoned
						if fic.ficStatus != FicStatus.complete:
							fic.ficStatus = FicStatus.abandoned
						return fic

		# NOTE(review): in the visible code, a missing profile_top with no
		# matching warning falls through to here and fails on None.get_text()
		text = profile_top.get_text()
		pt_str = str(profile_top)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		# the title is the <b> whose only class is 'xcontrast_txt'
		for b in profile_top.find_all('b'):
			b_class = b.get('class')
			if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
				fic.title = b.get_text()
			# NOTE(review): as written this raise runs on the first iteration;
			# a `break` and a loop-level `else:` were presumably lost -- confirm
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		descriptionFound = False
		for div in profile_top.find_all('div'):
			div_class = div.get('class')
			# NOTE(review): the closing `):` of this condition is missing
			if (
				div.get('style') == 'margin-top:2px' and len(div_class) == 1
				and div_class[0] == 'xcontrast_txt'
				fic.description = div.get_text()
				descriptionFound = True
		if descriptionFound == False:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# TODO we should match this only on the section following the description
		# NOTE(review): the closing of this RegexMatcher call (and whatever
		# applied `matcher` to fic) is missing; keys ending in '?' presumably
		# mark optional fields -- confirm against RegexMatcher. The patterns
		# are non-raw strings with `\s`/`\d`/`\S` escapes, which modern
		# Python flags as invalid escape sequences.
		matcher = RegexMatcher(
			text, {
				'ageRating': ('Rated:\s+Fiction\s*(\S+)', str),
				'chapterCount?': ('Chapters:\s+(\d+)', int),
				'wordCount': ('Words:\s+(\S+)', int),
				'reviewCount?': ('Reviews:\s+(\S+)', int),
				'favoriteCount?': ('Favs:\s+(\S+)', int),
				'followCount?': ('Follows:\s+(\S+)', int),
				'updated?': ('Rated:.*Updated:\s+(\S+)', str),
				'published': ('Published:\s+([^-]+)', str),

		# convert the matched date strings into OilTimestamps
		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		elif fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# NOTE(review): the closing parenthesis of this re.search call is
		# missing
		match = re.search(
			'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
		if match is None:
			fic.ficStatus = FicStatus.ongoing
			# NOTE(review): `else:` lines appear to be missing around the
			# next four lines; as written match.group is called when match is
			# None and the raise follows the 'Complete' assignment directly
			status = match.group(2)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
				raise Exception('unknown status: {}: {}'.format(fic.url, status))

		# the author is the link whose href starts with '/u/'; the path
		# segment after it is used as the author id
		for a in profile_top.find_all('a'):
			a_href = a.get('href')
			if a_href.startswith('/u/'):
				author = a.get_text()
				authorUrl = self.baseUrl + a_href
				authorId = a_href.split('/')[2]
				self.setAuthor(fic, author, authorUrl, authorId)
			# NOTE(review): as written this raise runs on the first iteration;
			# a `break` and a loop-level `else:` were presumably lost -- confirm
			raise Exception('unable to find author:\n{}'.format(text))

		preStoryLinks = soup.find(id='pre_story_links')
		preStoryLinksLinks = []
		if preStoryLinks is not None:
			preStoryLinksLinks = preStoryLinks.find_all('a')
		pendingFandoms: List[Fandom] = []
		# classify each link by the shape of its href path
		# NOTE(review): the closing `):` of each condition below, and
		# presumably a `continue` at the end of each branch, are missing
		for a in preStoryLinksLinks:
			href = a.get('href')
			hrefParts = href.split('/')

			# if it's a top level category
			if (
				len(hrefParts) == 3 and len(hrefParts[0]) == 0
				and len(hrefParts[2]) == 0
				cat = hrefParts[1]
				if cat in ffNetFandomCategories:
					continue  # skip categories
				raise Exception('unknown category: {}'.format(cat))

			# if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
			if (
				len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
				and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
				fIds = [int(hrefParts[2]), int(hrefParts[3])]
				pendingFandoms += self.handleCrossoverFandom(
					fic, hrefParts[1], fIds, href

			# if it's a regular fandom in some category
			if (
				len(hrefParts) == 4 and len(hrefParts[0]) == 0
				and len(hrefParts[3]) == 0
				# ensure category is in our map
				if hrefParts[1] not in ffNetFandomCategories:
					raise Exception('unknown category: {}'.format(hrefParts[1]))

				pendingFandoms += self.handleFandom(fic, hrefParts[2])

			util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

		# re-select the fic by (sourceId, localId); exactly one row is
		# expected
		# NOTE(review): nothing visible saves fic before this select, though
		# the error message mentions an upsert -- a line may be missing
		poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
		if len(poss) != 1:
			raise Exception(f'unable to upsert fic?')
		fic = poss[0]
		# NOTE(review): the body of this loop is missing
		for pfandom in pendingFandoms:

		if fic.chapterCount is None:
			return fic

		# chapter titles come from the chap_select dropdown, when present
		chapterTitles = []
		if fic.chapterCount > 1:
			chapterSelect = soup.find(id='chap_select')
			chapterOptions = []
			if chapterSelect is not None:
				chapterOptions = chapterSelect.findAll('option')
			chapterTitles = [co.getText().strip() for co in chapterOptions]

		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = str(cid)
			ch.url = self.constructUrl(fic.localId, cid)
			# NOTE(review): this test uses `> cid` while the index used is
			# cid - 1, so the last dropdown title is never applied -- confirm
			# whether `>=` was intended
			if len(chapterTitles) > cid:
				ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
			elif fic.chapterCount == 1 and cid == 1:
				ch.title = fic.title

		metaSpan = profile_top.find('span', {'class': 'xgray'})
		if metaSpan is not None:
				# NOTE(review): a `try:` line appears to be missing above this
				# over-indented block (see the `except` below)
				res = self.parseFicMetaSpan(metaSpan.decode_contents())
				#fic.language = res["language"]

				# reconstruct
				# NOTE(review): the closing bracket of this list and the
				# closing parenthesis of the join call below are missing
				fields = [
					('rated', 'Rated: Fiction ZZZ'),
					('language', 'Language: ZZZ'),
					('genres', 'Genre: ZZZ'),
					('characters', 'Characters: ZZZ'),
					('reviews', 'Reviews: ZZZ'),
					('favorites', 'Favs: ZZZ'),
					('follows', 'Follows: ZZZ'),
				# each template's ZZZ placeholder is replaced with the parsed
				# value, skipping fields absent from res
				rmeta = ' - '.join(
					[f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]

				fic.extraMeta = rmeta
				publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
				fic.published = OilTimestamp(publishedUts)
				fic.updated = fic.published
				if 'updated' in res:
					updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
					fic.updated = OilTimestamp(updatedUts)

			except Exception as e:
					# NOTE(review): the lines opening the calls that receive
					# these two messages are missing
					f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
					f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'

		return fic
Ejemplo n.º 7
    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        """Fill ``fic`` with metadata scraped from a story's info page.

        Sets the title, author, description, age rating, chapter count,
        status, published/updated timestamps and word count from ``html``,
        adds the 'Harry Potter' fandom, and returns the same ``fic`` object.

        Raises:
            Exception: if the page does not contain exactly three
                ``width="95%"`` tables, or if the author link or the
                summary cannot be located.
        """
        from bs4 import BeautifulSoup
        html = html.replace('\r\n', '\n')
        soup = BeautifulSoup(html, 'html.parser')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        w95tables = soup.findAll('table', {'width': '95%'})
        if len(w95tables) != 3:
            raise Exception('wrong number of w95 tables: {}'.format(
                len(w95tables)))

        # the first of the three tables holds the title heading
        ficInfoTable = w95tables[0]
        ficTitleH3 = ficInfoTable.find('h3')
        fic.title = ficTitleH3.get_text().strip()

        authorUrlMatch = re.search(
            r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
        if authorUrlMatch is None:
            raise Exception('could not locate author url')

        author = authorUrlMatch.group(2)
        authorId = authorUrlMatch.group(1)
        authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId

        self.setAuthor(fic, author, authorUrl, authorId)

        # the summary is everything between the Summary and Hitcount labels
        summaryMatch = re.search(
            '<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE)
        if summaryMatch is None:
            edumpContent(html, 'siye_summary')
            raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

        fic.description = summaryMatch.group(1).strip()

        fic.ageRating = '<unkown>'

        ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
        if ageRatingMatch is not None:
            fic.ageRating = ageRatingMatch.group(1).strip()

        # The chapter count is the largest chapter number linked from the
        # page. Single-chapter stories link to "&chapter=Array" instead of a
        # number, so that link is detected before attempting int().
        maxChapter = 0
        baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
        singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
            fic.localId)
        isSingleChapterFic = False
        for a in soup.find_all('a'):
            href = a.get('href')
            if href is None:
                continue
            if not href.startswith(baseChapterHref):
                continue
            if href.startswith(singleChapterHref):
                isSingleChapterFic = True
                maxChapter = max(1, maxChapter)
                continue
            cid = int(href[len(baseChapterHref):])
            maxChapter = max(cid, maxChapter)

        fic.chapterCount = maxChapter

        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # str.find returns -1 (truthy) when the marker is absent, so test
        # membership instead of the find() result
        fic.ficStatus = FicStatus.ongoing
        if 'Story is Complete' in html:
            fic.ficStatus = FicStatus.complete

        # published/updated are the earliest/latest "updated on" dates found
        updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
        minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
        maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
        for (year, month, day) in updatedOnPattern.findall(html):
            date = '{}/{}/{}'.format(year, month, day)
            dt = util.parseDateAsUnix(date, fic.fetched)

            minUpdate = min(minUpdate, dt)
            maxUpdate = max(maxUpdate, dt)

        if fic.published is None or fic.published.toUTS() > minUpdate:
            fic.published = OilTimestamp(minUpdate)
        if fic.updated is None or fic.updated.toUTS() < maxUpdate:
            fic.updated = OilTimestamp(maxUpdate)
        if fic.updated < fic.published:
            fic.updated = fic.published

        # total word count is the sum of every "<n> words" figure on the page
        fic.wordCount = 0
        wordsPattern = re.compile(r'(\d+) words')
        for words in wordsPattern.findall(html):
            fic.wordCount += int(words)

        if fic.wordCount == 0 and isSingleChapterFic:
            # no per-chapter figures listed: count the words of chapter 1
            # NOTE(review): the original body was indented one level deeper,
            # so a wrapping statement may have been lost -- confirm.
            ch1 = fic.chapter(1)
            chtml = ch1.html()
            if chtml is not None:
                fic.wordCount = len(chtml.split())

        fic.add(Fandom.define('Harry Potter'))
        # TODO: chars/relationship?

        return fic
Ejemplo n.º 8
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        # Fill `fic` with metadata scraped from the story page HTML in
        # `wwwHtml` and return it.
        # NOTE(review): this block looks truncated in several places (unclosed
        # calls, missing else/break branches); each spot is marked below.
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(wwwHtml, 'html.parser')
        divDetails = soup.find_all('div', {'class': 'details'})
        if len(divDetails) != 1:
            raise Exception('error: unable to find details\n')
            # NOTE(review): unreachable after the raise as written; presumably
            # an `else:` line is missing above -- confirm.
            divDetails = divDetails[0]

        # plain text and raw markup of the details div
        text = divDetails.get_text()
        pt_str = str(divDetails)

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        divTitle = soup.find_all('div', {'class': 'title'})
        if len(divTitle) == 1:
            fic.title = divTitle[0].get_text().strip()
            # NOTE(review): as written this raises even when the title was
            # found; presumably an `else:` line is missing -- confirm.
            raise Exception(
                'error: unable to find title:\n{}\n'.format(pt_str))

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: this may not exist on fictionhunt?
        # NOTE(review): the format() call below is never closed; its argument
        # line appears to be missing.
        fic.description = 'archive of {} from fictionhunt TODO'.format(

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # Maps a field name to (regex, converter); a trailing '?' on the name
        # presumably marks the field optional -- verify against RegexMatcher.
        # NOTE(review): the dict and call are never closed, and `matcher` is
        # not used afterwards; the closing lines and the call that applies it
        # to `fic` appear to be missing.
        matcher = RegexMatcher(
            text, {
                'ageRating': ('Rated:\s+(\S+)', str),
                'chapterCount?': ('Chapters:\s+(\d+)', int),
                'wordCount': ('Words:\s+(\S+)', int),
                'reviewCount?': ('Reviews:\s+(\S+)', int),
                'favoriteCount?': ('Favs:\s+(\S+)', int),
                'followCount?': ('Follows:\s+(\S+)', int),
                'updated?': ('Updated:\s+(\S+)', str),
                'published': ('Published:\s+(\S+)', str),

        # convert the date values on `fic` into OilTimestamp values
        if fic.published is not None:
            publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        # a fic with no update date counts as last updated when published
        if fic.updated is None:
            fic.updated = fic.published
        elif fic.updated is not None:
            updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.chapterCount is None:
            fic.chapterCount = 1

        match = re.search('- Complete -', text)
        if match is None:
            fic.ficStatus = FicStatus.ongoing
            # NOTE(review): as written the status is always overwritten with
            # `complete`; presumably an `else:` line is missing -- confirm.
            fic.ficStatus = FicStatus.complete

        # the author is the link pointing at a fanfiction.net user page
        for a in divDetails.find_all('a'):
            a_href = a.get('href')
            if a_href.find('fanfiction.net/u/') != -1:
                author = a.get_text()
                authorUrl = a_href
                authorId = a_href.split('/')[-1]
                self.setAuthor(fic, author, authorUrl, authorId)
            # NOTE(review): as written this raises on the first link examined;
            # presumably `break` and a loop `else:` are missing -- confirm.
            raise Exception('unable to find author:\n{}'.format(text))

        # TODO: hardcode Harry Potter fanfic?

        return fic
Ejemplo n.º 9
	def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
		# Fill `fic` from the HTML fragment `html`, stamping it as fetched at
		# unix time `ts`, and return it. Returns `fic` untouched when its
		# existing `fetched` time is later than `ts`.
		# NOTE(review): this block looks truncated in several places (unclosed
		# calls, missing else/break branches); each spot is marked below.
		# existing data is newer, do nothing
		if fic.fetched is not None and fic.fetched.toUTS() > ts:
			return fic
		from bs4 import BeautifulSoup

		soup = BeautifulSoup(html, 'html5lib')

		text = soup.get_text()
		pt_str = str(html)

		fic.fetched = OilTimestamp(ts)
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# title comes from the first <a class="stitle">
		for a in soup.find_all('a', {'class': 'stitle'}):
			fic.title = a.getText()
			# NOTE(review): as written this raises right after the title is
			# set; presumably `break` and a loop `else:` are missing -- confirm.
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		# description is the first child of the first <div class="z-padtop">
		for div in soup.find_all('div', {'class': 'z-padtop'}):
			fic.description = div.contents[0]
			# NOTE(review): same pattern as above -- `break`/`else:` appear to
			# be missing.
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# Maps a field name to (regex, converter); a trailing '?' on the name
		# presumably marks the field optional -- verify against RegexMatcher.
		# NOTE(review): the dict and call are never closed, and `matcher` is
		# not used afterwards; the closing lines and the call that applies it
		# to `fic` appear to be missing.
		matcher = RegexMatcher(
			text, {
				'ageRating': ('Rated:\s+(?:Fiction)?\s*(\S+)', str),
				'chapterCount?': ('Chapters:\s+(\d+)', int),
				'wordCount': ('Words:\s+(\S+)', int),
				'reviewCount?': ('Reviews:\s+(\S+)', int),
				'favoriteCount?': ('Favs:\s+(\S+)', int),
				'followCount?': ('Follows:\s+(\S+)', int),
				'updated?': ('Updated:\s+(\S+)', str),
				'published': ('Published:\s+([^-]+)', str),

		# convert the date values on `fic` into OilTimestamp values
		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		# a fic with no update date counts as last updated when published
		if fic.updated is None:
			fic.updated = fic.published
		elif fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# NOTE(review): the re.search( call is not closed, and `match.group`
		# is reached in the `match is None` branch as written; presumably `)`
		# and two `else:` lines are missing -- confirm.
		match = re.search(
			'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
		if match is None:
			fic.ficStatus = FicStatus.ongoing
			status = match.group(2)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
				raise Exception('unknown status: {}: {}'.format(fic.url, status))

		# the author is the first link whose href starts with /u/; the id is
		# the path segment after it
		for a in soup.find_all('a'):
			a_href = a.get('href')
			if a_href.startswith('/u/'):
				author = a.get_text()
				authorUrl = self.baseUrl + a_href
				authorId = a_href.split('/')[2]
				self.setAuthor(fic, author, authorUrl, authorId)
			# NOTE(review): as written this raises on the first link examined;
			# presumably `break` and a loop `else:` are missing -- confirm.
			raise Exception('unable to find author:\n{}'.format(text))

		# fandom is taken from the data-category attribute of the z-list div
		zl = soup.find('div', {'class': 'z-list'})
		fan = None if zl is None else zl.get('data-category')
		pendingFandoms: List[Fandom] = []
		if fan is not None:
			pendingFandoms += self.handleFandom(fic, fan)
			# TODO: crossovers?

		#raise Exception('todo')

		# NOTE(review): the loop body is missing here; presumably each pending
		# fandom is attached to `fic` -- confirm.
		for pfandom in pendingFandoms:

		return fic
Ejemplo n.º 10
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		# Fill `fic` (metadata plus per-chapter url/id/title) from the story
		# page HTML in `wwwHtml` and return it.
		# NOTE(review): this block looks truncated in several places (missing
		# try/else/continue lines, unclosed calls); each spot is marked below.
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'  # TODO?

		ficTitleDiv = soup.find('div', {'class': 'fic-title'})
		fic.title = ficTitleDiv.find('h1').getText().strip()

		# author id is the last path segment of the author link
		authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
		author = authorLink.getText().strip()
		authorUrl = self.baseUrl + authorLink.get('href')
		authorId = authorUrl.split('/')[-1]
		self.setAuthor(fic, author, authorUrl, authorId)

		divDescription = soup.find('div', {'class': 'description'})
			# NOTE(review): the lines below are indented with no opening
			# statement, and the description is assigned twice; presumably a
			# `try:` / `except` pair (HtmlView first, plain text as fallback)
			# is missing -- confirm.
			descView = HtmlView(str(divDescription), markdown=False)
			desc = ''.join(['<p>{}</p>'.format(l) for l in descView.text])
			fic.description = desc
			fic.description = divDescription.getText().strip()

		# status is detected from marker text in the fiction-info div's markup
		fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
		if fictionInfo.find('>ONGOING<') != -1:
			fic.ficStatus = FicStatus.ongoing
		elif fictionInfo.find('>COMPLETED<') != -1:
			fic.ficStatus = FicStatus.complete
		elif fictionInfo.find('>HIATUS<') != -1:
			fic.ficStatus = FicStatus.ongoing  # TODO?
		elif fictionInfo.find('>STUB<') != -1:
			fic.ficStatus = FicStatus.ongoing  # TODO?
		elif fictionInfo.find('>DROPPED<') != -1:
			fic.ficStatus = FicStatus.abandoned
			# NOTE(review): as written this raises for DROPPED stories;
			# presumably an `else:` line is missing -- confirm.
			raise Exception('unable to find fic status')

		# the stats list is the grandparent of the 'Followers :' text node
		divStatsContent = soup.find('div', {'class': 'stats-content'})
		followers = divStatsContent.find(text='Followers :')
		ul = followers.parent.parent

			# NOTE(review): the opening of this call (presumably a
			# RegexMatcher over the list text) and its closing lines are
			# missing.
			ul.getText(), {
				'followCount?': ('Followers\s+:\s+([\d,]+)', str),
				'favoriteCount?': ('Favorites\s+:\s+([\d,]+)', str),

		# strip thousands separators and convert the counts to int
		# NOTE(review): str.find returns -1 (truthy) when ',' is absent, so
		# these conditions are true for nearly every value; the int()
		# conversion still yields the right number in that case.
		if str(fic.followCount).find(','):
			fic.followCount = int(str(fic.followCount).replace(',', ''))
		if str(fic.favoriteCount).find(','):
			fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

		tableChapters = soup.find('table', {'id': 'chapters'})
		chapterLinks = tableChapters.findAll('a')

		# first pass: collect url and title from chapter links
		chapterUrls: List[str] = []
		chapterTitles: List[str] = []
		for chapterLink in chapterLinks:
			# TODO FIXME is this inverted?
			# NOTE(review): the body of this `if` is missing (presumably
			# `continue`, skipping links that hold a <time>) -- confirm.
			if chapterLink.find('time') is not None:
			chapterUrls += [chapterLink.get('href')]
			chapterTitles += [chapterLink.getText().strip()]

		# second pass: collect a unix timestamp from each link's <time>
		chapterDates: List[int] = []
		for chapterLink in chapterLinks:
			# NOTE(review): the body of this `if` is missing (presumably
			# `continue`) -- confirm.
			if chapterLink.find('time') is None:
			timeElement = chapterLink.find('time')
			if timeElement.get('unixtime'):
				chapterDates += [int(timeElement.get('unixtime'))]
				# NOTE(review): presumably an `else:` precedes this fallback to
				# parsing the `title` attribute; the list literal is also
				# never closed.
				chapterDates += [
					util.parseDateAsUnix(timeElement.get('title'), fic.fetched)

		# publication/update times are the earliest/latest chapter dates
		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))
		fic.chapterCount = len(chapterUrls)

		if fic.wordCount is None:
			fic.wordCount = 0

		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			chapter.url = self.baseUrl + chapterUrls[cid - 1]
			if chapterUrls[cid - 1].startswith('/fiction/chapter/'):
				# alternate chapter syntax if the chapter itself has no slug
				# /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
				# NOTE(review): the parenthesis below is never closed and the
				# second assignment overwrites the first; presumably `)` and
				# `else:` lines are missing -- confirm.
				chapter.localChapterId = (
					chapterUrls[cid - 1].split('/')[3].split('?')[0]
				# standard chapter syntax
				# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
				chapter.localChapterId = chapterUrls[cid - 1].split('/')[5]
			chapter.title = chapterTitles[cid - 1]

			if chapter.title is not None and len(chapter.title) > 0:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)


		# word count is the whitespace-separated token count of every
		# chapter's html
		wordCount = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			# NOTE(review): the body of this `if` is missing; what happens for
			# a chapter without html cannot be told from here.
			if chapter.html() is None:

			chtml = chapter.html()
			if chtml is not None:
				wordCount += len(chtml.split())

		fic.wordCount = wordCount

		return fic
Ejemplo n.º 11
    def extractSearchMetadata(
        html: str,
        # NOTE(review): mutable default -- this dict is mutated and returned
        # below, so entries accumulate across calls that omit `metas`.
        metas: Dict[str, AdultFanfictionMeta] = {}
    ) -> Dict[str, AdultFanfictionMeta]:
        # Parse a search-result page (`html`) into AdultFanfictionMeta objects
        # keyed by story url, merging them into `metas`; an existing entry is
        # replaced only when the new one reports isNewerThan() it. Returns
        # `metas`.
        # NOTE(review): no self/cls parameter although indented as a method;
        # presumably a @staticmethod decorator sits outside this view.
        # NOTE(review): this block looks truncated in several places (unclosed
        # literals and calls); each spot is marked below.
        from bs4 import BeautifulSoup
        # archive name (first component of the local id) -> fandom name
        # NOTE(review): closing `}` is missing.
        archiveFandomMap = {
            'naruto': 'Naruto',
            'hp': 'Harry Potter',
            'xmen': 'X-Men',
        # (suffix of the "Located" text, fandom name) pairs
        # NOTE(review): closing `]` is missing.
        locatedFandomMap = [
            ('Mass Effect', 'Mass Effect'),
            ('Metroid', 'Metroid'),
            ('Pokemon', 'Pokemon'),
            ('Sonic', 'Sonic'),
            ('Witcher 3: Wild Hunt', 'Witcher'),
        # character names tried pairwise as "<c1>/<c2>" against "Located"
        # NOTE(review): closing `]` is missing.
        chars = [
            'Harry', 'Hermione', 'Snape', 'Draco', 'Sirius', 'Remus', 'Lucius',
            'Ron', 'Voldemort', 'Ginny', 'Charlie', 'Lily', 'Scorpius',
            'James', 'George', 'Fred', 'Narcissa', 'Blaise', 'Bill', 'Luna',
            'Albus', 'Severus', 'Fenrir', 'Tonks', 'Rose', 'Neville', 'Cho',
            'Cedric', 'Tom', 'Seamus', 'Pansy', 'Bellatrix', 'Viktor', 'Percy',
            'Dudley', 'McGonagall', 'Lavendar', 'Dumbledore', 'Naruto',
            'Sasuke', 'Kakashi', 'Iruka', 'Sakura', 'Itachi', 'Gaara',
            'Shikamaru', 'Neji', 'Rock Lee', 'Hinata', 'Ino', 'Shino', 'Danzo'

        spaceSqeeezeRe = re.compile('\s+')

        # each search result is one <table width="90%">
        searchSoup = BeautifulSoup(html, 'html5lib')
        resultTables = searchSoup.findAll('table', {'width': '90%'})
        for resultTable in resultTables:
            meta = AdultFanfictionMeta()

            # first link in the table is the story, second is the author
            links = resultTable.findAll('a')
            titleLink = links[0]
            meta.title = titleLink.getText()
            meta.url = titleLink.get('href')

            authorLink = links[1]
            meta.author = authorLink.getText().strip()
            meta.authorUrl = authorLink.get('href').strip()
            assert (meta.authorUrl is not None)
            # author id is whatever follows the last '=' in the author url
            meta.authorId = meta.authorUrl.split('=')[-1]

            trs = resultTable.findAll('tr')

            # row 0: publication date
            # NOTE(review): the RegexMatcher call is never closed and nothing
            # visibly applies it to `meta`; lines appear to be missing.
            publishedText = trs[0].getText()
            RegexMatcher(publishedText, {
                'published': ('Published\s+:\s+(.+)', str),
            assert (meta.published is not None)
            # NOTE(review): second argument and closing `)` are missing.
            meta.published = util.parseDateAsUnix(meta.published,

            # row 1: chapter count, update date, reviews, views, location
            extendedMetadata = trs[1].getText()
            util.logMessage(extendedMetadata, 'tmp_e_meta_aff.log')
            # TODO: dragon prints are actually views, not followCount/favoriteCount
                # NOTE(review): the opening of this call (presumably
                # RegexMatcher) and its closing lines are missing.
                extendedMetadata, {
                    'chapterCount': ('Chapters\s*:\s*(\d+)', int),
                    'updated': ('Updated\s+:\s+(.+?)-:-', str),
                    'reviewCount?': ('Reviews\s+:\s+(\d+)', int),
                    'views?': ('Dragon prints\s+:\s+(\d+)', int),
                    'located?': ('Located\s*:\s*(.*)', str)
            assert (meta.updated is not None)
            meta.updated = util.parseDateAsUnix(meta.updated, int(time.time()))

            # row 2: description markup, filtered and whitespace-collapsed
            meta.description = str(trs[2])
            meta.description = util.filterUnicode(meta.description)
            meta.description = spaceSqeeezeRe.sub(' ', meta.description)


            # NOTE(review): nothing in view populates meta.tags before this
            # check; the code that fills it may be missing.
            if 'COMPLETE' in meta.tags or 'Complete.' in meta.tags:
                meta.ficStatus = FicStatus.complete

            # local id is split on '/' into archive and story number
            assert (meta.url is not None)
            ficId = FicId.tryParseUrl(meta.url)
            assert (ficId is not None)
            meta.localId = ficId.localId
            meta.archive = meta.localId.split('/')[0]
            meta.storyNo = meta.localId.split('/')[1]
            if meta.archive.lower() in archiveFandomMap:
                meta.fandoms += [archiveFandomMap[meta.archive.lower()]]

            meta.located = meta.located or ''
            loclow = meta.located.lower()

            # add a fandom when the location text ends with a known name
            for locFan in locatedFandomMap:
                if loclow.endswith(locFan[0].lower()):
                    meta.fandoms += [locFan[1]]

            # add both characters when the location ends with "<c1>/<c2>"
            for c1 in chars:
                for c2 in chars:
                    if loclow.endswith('{}/{}'.format(c1, c2).lower()):
                        meta.chars += [c1, c2]

            # TODO: try parse category, get chars

            if meta.url not in metas or meta.isNewerThan(metas[meta.url]):
                metas[meta.url] = meta

        return metas
Ejemplo n.º 12
    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        # Fill `fic` with metadata scraped from the story page in `html` and
        # return it. Raises Exception when title or author cannot be found.
        # NOTE(review): this block looks truncated in several places (unclosed
        # calls, missing else/continue/break lines); each spot is marked below.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        # title and author both come from links inside #pagetitle
        pagetitle = soup.find(id='pagetitle')
        aTags = pagetitle.findAll('a')
        author = None
        for a in aTags:
            href = a.get('href')
            if href.startswith('viewstory'):
                fic.title = a.contents[0].strip()
            elif href.startswith('viewuser.php?uid='):
                author = a.contents[0]
                authorUrl = self.baseUrl + href
                # int() round-trip rejects a non-numeric uid
                authorId = str(int(href[len('viewuser.php?uid='):]))
                self.setAuthor(fic, author, authorUrl, authorId)

        if fic.title is None:
            raise Exception('unable to find title')
        if author is None:
            raise Exception('unable to find author')

        # Split the markup so every tag starts its own line, then keep the
        # lines from the SUMMARY START marker up to (not including) the
        # SUMMARY END marker.
        lines = html.replace('\r', '\n').replace('<', '\n<').split('\n')
        inDescription = False
        description = ''
        for line in lines:
            cur = line.strip()
            if cur.find('!-- SUMMARY START --') != -1:
                inDescription = True
            elif cur.find('!-- SUMMARY END --') != -1:
                inDescription = False

            if inDescription == True:
                description += cur + '\n'

        fic.description = description

        fic.ageRating = '<unkown>'

        # locate the block whose title div reads 'Story Information'
        infoBlock = None
        infoText = None
        blocks = soup.findAll('div', {'class': 'block'})
        for block in blocks:
            title = block.find('div', {'class': 'title'})
            # NOTE(review): both `if` bodies below are missing (presumably
            # `continue`), and the raise runs unconditionally as written;
            # presumably `break` and a loop `else:` are missing too -- confirm.
            if title is None:
            if title.contents[0] != 'Story Information':
            infoBlock = block
            infoText = block.get_text()
            raise Exception('unable to find info text')

        # NOTE(review): the dict and call are never closed, and `matcher` is
        # not used afterwards; the closing lines and the call that applies it
        # to `fic` appear to be missing.
        matcher = RegexMatcher(
            infoText, {
                'chapterCount': ('Chapters:\s+(\d+)', int),
                'wordCount': ('Word count:\s+(\S+)', int),

        # review count is read from the #sort element's text, commas removed
        sortDiv = soup.find(id='sort')
        match = re.search('Reviews\s*-\s*([^\]]+)', sortDiv.get_text())
        if match is not None:
            fic.reviewCount = int(match.group(1).replace(',', ''))
            # NOTE(review): as written the parsed count is immediately reset
            # to 0; presumably an `else:` line is missing -- confirm.
            fic.reviewCount = 0

        fic.favoriteCount = 0
        fic.followCount = 0

        # published/updated dates sit between HTML comment markers
        infoBlockHtml = str(infoBlock)
        # NOTE(review): the re.search( call is never closed; its second
        # argument (presumably infoBlockHtml) is missing.
        match = re.search(
            '<!-- PUBLISHED START -->([^<]*)<!-- PUBLISHED END -->',
        if match is not None:
            publishedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        # NOTE(review): same truncation as above -- second argument missing.
        match = re.search('<!-- UPDATED START -->([^<]*)<!-- UPDATED END -->',
        if match is not None:
            updatedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.updated is None:
            fic.updated = fic.published

        match = re.search('Completed:\s+(\S+)', infoText)
        if match is not None:
            complete = match.group(1)
            if complete == 'No':
                fic.ficStatus = FicStatus.ongoing
            elif complete == 'Yes':
                fic.ficStatus = FicStatus.complete
                # NOTE(review): as written this raises for 'Yes'; presumably
                # an `else:` line is missing -- confirm.
                raise Exception('unknown complete value: {}'.format(complete))

        match = re.search('Crossovers', infoText)
        if match is not None:
            pass  # raise Exception('Found unknown crossover in {0}: {1}'.format(fic.id, fic.url))
            # NOTE(review): the comment below suggests an `else:` line is
            # missing here; as written the fandom is added only for crossovers.
            # otherwise not a crossover and just harry potter
            fic.add(Fandom.define('Harry Potter'))

        return fic
Ejemplo n.º 13
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        # Fill `fic` with metadata scraped from the story page HTML in
        # `wwwHtml`, add the 'Harry Potter' fandom, and return it.
        # NOTE(review): this block looks truncated in several places (unclosed
        # calls, missing else/continue/break lines); each spot is marked below.
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(wwwHtml, 'html.parser')
        # exactly one table.storymaininfo is required
        storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
        if len(storyMainInfo) != 1:
            raise Exception('unable to find main story info')
        storyMainInfo = storyMainInfo[0]

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        # The title link's href is either the plain '?psid=<localId>' or
        # starts with the adult-content confirm() javascript below.
        disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
        for a in soup.findAll('a'):
            href = a.get('href')
            # NOTE(review): the body of this `if` is missing (presumably
            # `continue`), and the raise runs unconditionally as written;
            # presumably `break` and a loop `else:` are missing too -- confirm.
            if (not href.startswith(disclaimerJs)
                    and href != '?psid={}'.format(fic.localId)):
            fic.title = a.getText()
            raise Exception('error: unable to find title')

        fic.url = self.constructUrl(fic.localId)

        # description is the stripped text of the single table.storysummary
        storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
        if len(storySummaryTable) != 1:
            raise Exception('cannot find story summary table')
        storySummaryTable = storySummaryTable[0]
        fic.description = (storySummaryTable.getText().strip())
        if fic.description is None:
            raise Exception('error: unable to find description')

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # non-breaking spaces are normalised so the \s+ patterns below match
        text = storyMainInfo.getText().replace('\xa0', ' ')
        # NOTE(review): the dict and call are never closed, and `matcher` is
        # not used afterwards; the closing lines and the call that applies it
        # to `fic` appear to be missing.
        matcher = RegexMatcher(
            text, {
                'ageRating': ('Rating:\s+(Mature|15\+|12\+)', str),
                'chapterCount': ('Chapters:\s+(\d+)', int),
                'wordCount': ('Words:\s+(\d+)', int),
                'reviewCount': ('Story Reviews:\s*(\d+)', int),
                'favoriteCount': ('Favorite Story Of:\s+(\d+) users', int),
                'updated': ('Last Updated:\s+(\S+)', str),
                'published': ('First Published:\s+(\S+)', str),

        # convert the date values on `fic` into OilTimestamp values
        if fic.published is not None:
            publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        # a fic with no update date counts as last updated when published
        if fic.updated is None:
            fic.updated = fic.published
        elif fic.updated is not None:
            updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.chapterCount is None:
            fic.chapterCount = 1

        # NOTE(review): the re.search( call is never closed; its second
        # argument (presumably `text`) is missing.
        match = re.search('Status:\s+(Completed|Work In Progress|Abandoned)',
        if match is None:
            raise Exception('cannot find write status')

        status = match.group(1)
        if status == 'Completed':
            fic.ficStatus = FicStatus.complete
        elif status == 'Work In Progress':
            fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
        elif status == 'Abandoned':
            fic.ficStatus = FicStatus.abandoned
            # NOTE(review): as written this raises for 'Abandoned'; presumably
            # an `else:` line is missing -- confirm.
            raise Exception('unknown status: {}'.format(status))

        # the author is the link to viewuser.php?showuid=<id>
        for a in soup.findAll('a'):
            a_href = a.get('href')
            if a_href.startswith('viewuser.php?showuid='):
                author = a.get_text()
                authorUrl = self.baseUrl + '/' + a_href
                authorId = a_href[len('viewuser.php?showuid='):]
                self.setAuthor(fic, author, authorUrl, authorId)
            # NOTE(review): as written this raises on the first link examined;
            # presumably `break` and a loop `else:` are missing -- confirm.
            raise Exception('unable to find author:\n{}'.format(text))

        # TODO: chars/pairings?
        fic.add(Fandom.define('Harry Potter'))
        return fic
Ejemplo n.º 14
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		# Fill `fic` (metadata plus per-chapter ids/titles) from the story page
		# HTML in `wwwHtml` and return it. When the page has no #profile_top
		# and shows the "story not found" warning, `fic` is marked abandoned
		# and returned early.
		# NOTE(review): this block looks truncated in several places (unclosed
		# calls, missing else/break/continue lines); each spot is marked below.
		from bs4 import BeautifulSoup  # type: ignore
		deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
		soup = BeautifulSoup(wwwHtml, 'html5lib')
		profile_top = soup.find(id='profile_top')
		# story might've been deleted
		if profile_top is None:
			gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
			for gui_warning in gui_warnings:
				if gui_warning.get_text() == deletedFicText:
					fic.ficStatus = FicStatus.abandoned
					return fic

		# NOTE(review): if profile_top is None and no warning matched above,
		# the next line fails on None.
		text = profile_top.get_text()
		pt_str = str(profile_top)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		# title is the <b> whose only class is xcontrast_txt
		for b in profile_top.find_all('b'):
			b_class = b.get('class')
			if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
				fic.title = b.get_text()
			# NOTE(review): as written this raises on the first <b> examined;
			# presumably `break` and a loop `else:` are missing -- confirm.
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		# description is the xcontrast_txt div styled with margin-top:2px
		for div in profile_top.find_all('div'):
			div_class = div.get('class')
			# NOTE(review): the closing `):` of this condition is missing, as
			# are (presumably) `break` and a loop `else:` -- confirm.
			if (
				div.get('style') == 'margin-top:2px' and len(div_class) == 1
				and div_class[0] == 'xcontrast_txt'
				fic.description = div.get_text()
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# Maps a field name to (regex, converter); a trailing '?' on the name
		# presumably marks the field optional -- verify against RegexMatcher.
		# NOTE(review): the dict and call are never closed, and `matcher` is
		# not used afterwards; the closing lines and the call that applies it
		# to `fic` appear to be missing.
		matcher = RegexMatcher(
			text, {
				'ageRating': ('Rated:\s+Fiction\s*(\S+)', str),
				'chapterCount?': ('Chapters:\s+(\d+)', int),
				'wordCount': ('Words:\s+(\S+)', int),
				'reviewCount?': ('Reviews:\s+(\S+)', int),
				'favoriteCount?': ('Favs:\s+(\S+)', int),
				'followCount?': ('Follows:\s+(\S+)', int),
				'updated?': ('Updated:\s+(\S+)', str),
				'published': ('Published:\s+(\S+)', str),

		# convert the date values on `fic` into OilTimestamp values
		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		# a fic with no update date counts as last updated when published
		if fic.updated is None:
			fic.updated = fic.published
		elif fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# NOTE(review): `match.group` is reached in the `match is None` branch
		# as written; presumably two `else:` lines are missing -- confirm.
		match = re.search('Status:\s+(\S+)', text)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
			status = match.group(1)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
				raise Exception('unknown status: {}'.format(status))

		# the author is the first link whose href starts with /u/; the id is
		# the path segment after it
		for a in profile_top.find_all('a'):
			a_href = a.get('href')
			if a_href.startswith('/u/'):
				author = a.get_text()
				authorUrl = self.baseUrl + a_href
				authorId = a_href.split('/')[2]
				self.setAuthor(fic, author, authorUrl, authorId)
			# NOTE(review): as written this raises on the first link examined;
			# presumably `break` and a loop `else:` are missing -- confirm.
			raise Exception('unable to find author:\n{}'.format(text))

		# Links above the story are classified by the shape of their href:
		# '/<category>/' or '/<category>/<genre>/'.
		preStoryLinks = soup.find(id='pre_story_links')
		preStoryLinksLinks = preStoryLinks.find_all('a')
		for a in preStoryLinksLinks:
			href = a.get('href')
			hrefParts = href.split('/')

			# if it's a top level category
			# NOTE(review): the closing `):` of this condition is missing.
			if (
				len(hrefParts) == 3 and len(hrefParts[0]) == 0
				and len(hrefParts[2]) == 0
				cat = hrefParts[1]
				if cat in fictionPressCategories:
					continue  # skip categories
				raise Exception('unknown category: {}'.format(cat))

			# if it's a regular genre in some category
			# NOTE(review): the closing `):` of this condition is missing.
			if (
				len(hrefParts) == 4 and len(hrefParts[0]) == 0
				and len(hrefParts[3]) == 0
				# ensure category is in our map
				if hrefParts[1] not in fictionPressCategories:
					raise Exception('unknown category: {}'.format(hrefParts[1]))

				# ensure it's in our whitelist
				if hrefParts[2] not in fictionPressGenres:
					util.logMessage(f'FictionPressAdapter: unknown genre {hrefParts[2]}')

				# NOTE(review): statements appear to be missing here: nothing
				# records the genre on `fic` and nothing skips the fall-through
				# log below -- confirm.

			util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')


		# chapter titles come from the #chap_select <option> elements, which
		# are only looked up for multi-chapter fics
		chapterTitles = []
		if fic.chapterCount > 1:
			chapterSelect = soup.find(id='chap_select')
			chapterOptions = []
			if chapterSelect is not None:
				chapterOptions = chapterSelect.findAll('option')
			chapterTitles = [co.getText().strip() for co in chapterOptions]

		# `cid` is zero-based here; chapter ids are one-based
		for cid in range(fic.chapterCount):
			ch = fic.chapter(cid + 1)
			ch.localChapterId = str(cid + 1)
			if len(chapterTitles) > cid:
				ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
			elif fic.chapterCount == 1 and cid == 0:
				# a single-chapter fic reuses the fic title
				ch.title = fic.title

		return fic