def importFic(fdata):
    """Insert one fic, its relations and its chapters from a dumped dict.

    `fdata` is read for the keys 'fandoms', 'characters', 'genres', 'tags'
    and 'chapters'; a copy of it is passed through `inflateObject` (with the
    module-level `ficImportRename`) and the resulting fields are written
    directly into the new Fic's `__dict__`.  Progress is reported via print.
    """
    ofic = inflateObject(fdata.copy(), ficImportRename)

    fic = Fic.new()
    ficAttrs = fic.__dict__
    for field in ofic:
        value = ofic[field]
        print('setting "{}" to "{}"'.format(field, value))
        ficAttrs[field] = value

    # dates are normalised through util.parseDateAsUnix, with the current
    # unix time supplied as its second argument
    fic.published = util.parseDateAsUnix(fic.published, int(time.time()))
    fic.updated = util.parseDateAsUnix(fic.updated, int(time.time()))
    for dateField in ('published', 'updated'):
        print('setting "{}" to "{}"'.format(dateField, getattr(fic, dateField)))

    print('adding "{}" ({}/{})'.format(fic.title, fic.type, fic.localId))
    fic.insert()

    for fandomName in fdata['fandoms']:
        print(' adding fandom "{}"'.format(fandomName))
        fic.add(Fandom.define(fandomName))

    for charInfo in fdata['characters']:
        charName, charFandom = charInfo['name'], charInfo['fandom']
        print(
            ' adding character "{}" from fandom "{}"'.format(
                charName, charFandom
            )
        )
        fic.add(Character.define(Fandom.define(charFandom), charName))

    for genreName in fdata['genres']:
        print(' adding genre "{}"'.format(genreName))
        fic.add(Genre.define(genreName))

    for tagName in fdata['tags']:
        print(' adding tag "{}"'.format(tagName))
        fic.add(Tag.define(tagName))

    # chapter keys are strings in the dump; walk them in numeric order
    chapters = fdata['chapters']
    for cid in sorted(int(key) for key in chapters):
        print(' adding chapter {}'.format(cid))
        ochap = chapters[str(cid)]

        chapter = FicChapter.new()
        chapter.fic = fic
        chapter.ficId = fic.id
        chapter.chapterId = cid
        chapterAttrs = chapter.__dict__
        for field in ochap:
            chapterAttrs[field] = ochap[field]

        # chapter bodies live next to the dump under ./content/
        contentPath = './content/{}/{}/{}/content.html'.format(
            fic.type, fic.localId, cid
        )
        if os.path.isfile(contentPath):
            with open(contentPath, 'r') as f:
                html = f.read()
            print(' has content: {}'.format(len(html)))
            chapter.setHtml(html)

        chapter.insert()
def handleFandom(self, fic: Fic, fandom: str) -> List[Fandom]:
    """Return the Fandom rows for a single raw ("messy") fandom name.

    The raw name is always defined (tagged with this adapter's `ftype` as
    its sourceId) and returned first.  If `ffNetFandomMap` has an entry for
    it, the mapped fandom is defined and appended; otherwise the unknown name
    is logged together with `fic.url`.
    """
    # save raw/messy fandom
    rawFandom = Fandom.define(fandom, sourceId=self.ftype)

    if fandom in ffNetFandomMap:
        return [rawFandom, Fandom.define(ffNetFandomMap[fandom])]

    # messy name is not in our map: report it and keep only the raw entry
    util.logMessage('unknown fandom: {} (from {})'.format(fandom, fic.url))
    return [rawFandom]
def importDB(data):
    """Define every fandom, character, genre and tag in `data`, then import
    each entry of `data['fics']` via `importFic` in sorted key order.
    """
    for fandomName in data['fandoms']:
        Fandom.define(fandomName)

    for charInfo in data['characters']:
        Character.define(Fandom.define(charInfo['fandom']), charInfo['name'])

    for genreName in data['genres']:
        Genre.define(genreName)

    for tagName in data['tags']:
        Tag.define(tagName)

    fics = data['fics']
    for ficKey in sorted(fics):
        importFic(fics[ficKey])
def updateTitle(self, fic: Fic) -> None:
    """Strip completion / NSFW markers from `fic.title` and persist the fic.

    For every (open, close) pair in `self.containers`:
      * a wrapped 'complete' or 'completed' marker is removed and the fic is
        marked `FicStatus.complete`;
      * a wrapped 'nsfw' marker is removed and the age rating is set to 'M'.
    Matching is case-insensitive and only the first occurrence of each
    marker is removed.  The remaining title is passed to `self.cleanTitle`,
    whose result supplies the final title (index 0), extra fandom names
    (index 1) and extra tag names (index 2).  Finally `fic.upsert()` is
    called.  Does nothing at all when `fic.title` is None.
    """
    if fic.title is None:
        return

    def cutTag(tag: str) -> bool:
        """Remove the first occurrence of `tag` from fic.title; True on hit."""
        loc = fic.title.lower().find(tag)
        if loc == -1:
            return False
        fic.title = fic.title[:loc] + fic.title[loc + len(tag):]
        fic.title = fic.title.strip()
        # Cutting a marker out of the middle of a title leaves two adjacent
        # spaces behind; collapse them.  (The previous replace(' ', ' ')
        # replaced a space with itself and therefore did nothing.)
        fic.title = fic.title.replace('  ', ' ')
        return True

    # look for Complete tag in the title
    for cont in self.containers:
        for completeTag in ('complete', 'completed'):
            if cutTag(cont[0] + completeTag + cont[1]):
                fic.ficStatus = FicStatus.complete

    # strip '[nsfw]' tag from anywhere in title
    for cont in self.containers:
        if cutTag(cont[0] + 'nsfw' + cont[1]):
            fic.ageRating = 'M'  # TODO?

    res = self.cleanTitle(fic.title)
    fic.title = res[0]
    for fan in res[1]:
        fic.add(Fandom.define(fan))
    for tag in res[2]:
        fic.add(Tag.define(tag))

    fic.upsert()
def handleCrossoverFandom(
    self, fic: Fic, fandom: str, fIds: List[int], href: str
) -> List[Fandom]:
    """Return the Fandom rows for a crossover category.

    The raw crossover name is always defined (with this adapter's `ftype` as
    sourceId) and returned first.  The clean fandoms behind `fIds` are only
    appended when every id is present in `ffNetFandomIdMap`, every resulting
    messy name is present in `ffNetFandomMap`, and `fandom` equals
    '<first>_and_<second>_Crossovers' built from the first two messy names.
    Any failed check is logged and ends the lookup early with just the raw
    entry.
    """
    # save raw/messy fandom
    fandoms = [Fandom.define(fandom, sourceId=self.ftype)]

    # every fandom id must be known before we can translate it
    unknownIds = [fId for fId in fIds if fId not in ffNetFandomIdMap]
    if unknownIds:
        util.logMessage(
            'unknown fandom ids: {} from {} in {}'.format(
                unknownIds, href, fic.url
            )
        )
        return fandoms

    # translate ids to messy names, which must in turn be known
    messys = [ffNetFandomIdMap[fId] for fId in fIds]
    unknownMessys = [messy for messy in messys if messy not in ffNetFandomMap]
    if unknownMessys:
        util.logMessage(
            'unknown messy fandom: {} from {}'.format(unknownMessys, href)
        )
        return fandoms

    # the crossover name has to be the one the two messy names imply
    expected = '{}_and_{}_Crossovers'.format(messys[0], messys[1])
    if fandom != expected:
        util.logMessage(
            'crossover got "{}" expected "{}"'.format(fandom, expected)
        )
        return fandoms

    # map messy to clean, skipping empty mappings
    cleanNames = [ffNetFandomMap[messy] for messy in messys]
    fandoms.extend(
        Fandom.define(clean) for clean in cleanNames if len(clean) > 0
    )
    return fandoms
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Fill `fic` with the (mostly hard-coded) metadata for this story.

    Title, author, rating, language and description are constants.  The
    chapter count comes from `self.getChapterUrls(html)`; published/updated
    come from the first and last chapter URL via
    `self.getChapterPublishDate`.  When the word count is (re)set to zero,
    the fic is upserted and every chapter is cached and counted.  Returns
    `fic`.
    """
    from bs4 import BeautifulSoup

    html = html.replace('\r\n', '\n')
    # parsed as in the original, although nothing below consults the soup
    soup = BeautifulSoup(html, 'html.parser')

    # wooh hardcoding
    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")
    fic.title = 'The Waves Arisen'
    fic.ageRating = 'M'
    self.setAuthor(fic, 'wertifloke', 'https://wertifloke.wordpress.com/', '2')

    # NOTE(review): the original comment credited
    # https://www.parahumans.net/about/ for this text -- looks copy-pasted
    # from another adapter; confirm the real source.
    fic.description = ''' A young Naruto found refuge in the village library, and grew up smart, but by blood he is Ninja, and what place is there for curiosity and calculation in this brutal world of warring states? The Waves Arisen is a complete novel-length work of Rationalist Naruto Fanfiction. No prior knowledge of the Naruto universe is necessary to follow along. '''

    chapterUrls = self.getChapterUrls(html)
    previousChapterCount = fic.chapterCount
    fic.chapterCount = len(chapterUrls)

    # TODO?
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
        fic.ficStatus = FicStatus.ongoing

    firstUrl, lastUrl = chapterUrls[0], chapterUrls[-1]
    fic.published = self.getChapterPublishDate(firstUrl)
    fic.updated = self.getChapterPublishDate(lastUrl)

    # a first import or newly appeared chapters force a recount
    hasNewChapters = (
        previousChapterCount is None
        or fic.chapterCount > previousChapterCount
    )
    if hasNewChapters:
        fic.wordCount = 0

    if fic.wordCount == 0:
        fic.upsert()
        for cid in range(1, fic.chapterCount + 1):
            chapter = fic.chapter(cid)
            chapter.cache()
            chapterHtml = chapter.html()
            if chapterHtml is None:
                continue
            fic.wordCount += len(chapterHtml.split())

    fic.add(Fandom.define('Naruto'))
    # TODO: chars/relationship?

    return fic
def dumpDB():
    """Build and return a plain-dict dump of the whole database.

    The result has the keys 'fandoms', 'characters', 'genres', 'tags' (name
    lists / dicts) and 'fics', a dict keyed by '<type>/<localId>'.  Each fic
    entry is the fic's `__dict__` run through `deflateObject`, extended with
    its relation names and a 'chapters' dict keyed by chapterId.  As a side
    effect, every chapter whose `raw` is not None has its content written to
    ./content/<type>/<localId>/<chapterId>/content.html.
    """
    fandomMap = {f.id: f for f in Fandom.select()}
    characterMap = {c.id: c for c in Character.select()}
    genreMap = {g.id: g for g in Genre.select()}
    tagMap = {t.id: t for t in Tag.select()}

    data = {
        'fandoms': [fandom.name for fandom in fandomMap.values()],
        'characters': [
            {
                'name': character.name,
                'fandom': fandomMap[character.fandom_id].name
            } for character in characterMap.values()
        ],
        'genres': [genre.name for genre in genreMap.values()],
        'tags': [tag.name for tag in tagMap.values()],
        'fics': {},
    }

    # rename/default tables handed to deflateObject
    frename = {'id': None, 'chapters': 'chapterCount'}
    crename = {
        'id': None,
        'ficId': None,
        'cid': None,
        'raw': None,
        'fic': None,
        'lastLine': None
    }
    cdefaults = {
        'line': 0,
        'subLine': 0,
        'notes': None,
        'status': Status.ongoing,
        'fetched': None,
        'url': None
    }

    for fic in Fic.select():
        ficKey = '{}/{}'.format(fic.type, fic.localId)

        ficOut = deflateObject(fic.__dict__.copy(), frename)
        ficOut['fandoms'] = [fandom.name for fandom in fic.fandoms()]
        ficOut['characters'] = [
            {
                'name': character.name,
                'fandom': fandomMap[character.fandom_id].name
            } for character in fic.characters()
        ]
        ficOut['tags'] = [tag.name for tag in fic.tags()]
        ficOut['genres'] = [genre.name for genre in fic.genres()]

        chaptersOut = {}
        for chapter in FicChapter.select({'ficId': fic.id}):
            here = chapter.__dict__.copy()

            # the shared defaults dict is updated per chapter before use
            cdefaults['url'] = 'https://www.fanfiction.net/s/{}/{}/{}'.format(
                fic.localId, chapter.chapterId, util.urlTitle(fic.title)
            )
            cdefaults['lastModified'] = here['fetched']

            chaptersOut[chapter.chapterId] = deflateObject(
                here, crename, cdefaults
            )

            if chapter.raw is not None:
                contentDir = './content/{}/{}/{}/'.format(
                    fic.type, fic.localId, chapter.chapterId
                )
                if not os.path.isdir(contentDir):
                    os.makedirs(contentDir)
                with open(contentDir + 'content.html', 'w') as f:
                    f.write(chapter.content())

        ficOut['chapters'] = chaptersOut
        data['fics'][ficKey] = ficOut

    return data
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Fill `fic` from a story page plus the author's profile page.

    From `html` (the story page) this reads the author link, the title and
    the chapter dropdown out of the single ``td.info2_pane`` cell.  It then
    scrapes the author's profile URL to find the story table whose
    ``read.php?storyid=`` link matches `fic.localId`, and takes the
    description, review count and last-updated date from it.  Finally the
    fic is upserted and every chapter is titled, cached, upserted and added
    to the word count.

    Returns `fic`.

    Raises:
        Exception: when the info pane, author link, title, story table,
            review count or last-updated date cannot be located, or the
            story table does not have exactly three rows.
    """
    from bs4 import BeautifulSoup
    html = html.replace('\r\n', '\n')
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    infoPane = soup.findAll('td', {'class': 'info2_pane'})
    if len(infoPane) != 1:
        raise Exception('unable to find info2_pane: {}'.format(fic.url))
    infoPane = infoPane[0]

    authorHrefPrefix = 'index.php?action=profile&id='
    authorUrl = None
    for authorLink in infoPane.findAll('a'):
        # Tag.get returns None for anchors that carry no href attribute;
        # skip those instead of failing on None.startswith
        href = authorLink.get('href')
        if href is None or not href.startswith(authorHrefPrefix):
            continue

        authorUrl = self.baseUrl + '/' + href
        author = authorLink.getText()
        authorLocalId = href[len(authorHrefPrefix):]

        self.setAuthor(fic, author, authorUrl, authorLocalId)
        break
    else:
        # loop finished without hitting `break`: no profile link was found
        raise Exception('unable to find author: {}'.format(fic.url))

    titleMatch = re.search(
        r'<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane),
        re.MULTILINE
    )
    if titleMatch is None:
        edumpContent(str(infoPane), 'sugarquill_title')
        raise Exception('could not locate title')

    # collapse doubled spaces (the previous replace(' ', ' ') was a no-op)
    fic.title = titleMatch.group(1).replace('  ', ' ').strip()

    # the chapter dropdown supplies both the chapter count and the titles
    chapterOptions = infoPane.findAll('option')
    chapterTitles = {}
    for chapterOption in chapterOptions:
        cid = int(chapterOption.get('value'))
        chapterTitles[cid] = chapterOption.getText().strip()
    fic.chapterCount = len(chapterOptions)

    fic.ageRating = '<unkown>'  # TODO
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

    # the remaining metadata is only shown on the author's profile page
    authorProfileHtml = scrape.scrape(authorUrl)['raw']
    authorProfileHtml = authorProfileHtml.replace('\r', '')
    authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

    storyHrefPrefix = 'read.php?storyid='
    storyTables = authorSoup.findAll('table', {'width': '90%'})
    ourStoryTable = None
    for storyTable in storyTables:
        storyId = None
        for a in storyTable.findAll('a'):
            href = a.get('href')
            if href is None or not href.startswith(storyHrefPrefix):
                continue
            # Keep only the id, dropping any trailing '&...' parameters.
            # partition leaves the id intact when there is no '&'; the old
            # slice on find('&') == -1 silently cut off the last digit.
            storyId = href[len(storyHrefPrefix):].partition('&')[0]
            storyId = str(int(storyId))
        if storyId is None:
            continue
        if storyId != str(fic.localId):
            continue
        ourStoryTable = storyTable
    if ourStoryTable is None:
        raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

    trs = ourStoryTable.findAll('tr')
    if len(trs) != 3:
        raise Exception(
            f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
        )

    # row 0: header with review link, row 1: summary, row 2: update date
    fic.description = trs[1].find('td').getText().strip()

    reviewsMatch = re.search(
        r'\( Reviews: <a[^>]*>(\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
    )
    if reviewsMatch is None:
        edumpContent(str(trs[0]), 'sugarquill_reviews')
        raise Exception('could not locate reviews')
    fic.reviewCount = int(reviewsMatch.group(1).strip())

    updatedMatch = re.search(r'Last updated (\d+/\d+/\d+)', str(trs[2]))
    if updatedMatch is None:
        edumpContent(str(trs[2]), 'sugarquill_updated')
        raise Exception('could not locate last updated')

    fic.updated = OilTimestamp(
        util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
    )
    # no publish date is read from the page; fall back to the update date
    if fic.published is None:
        fic.published = fic.updated

    fic.wordCount = 0
    fic.upsert()

    for cid in range(1, fic.chapterCount + 1):
        ch = fic.chapter(cid)
        ch.localChapterId = str(cid)
        ch.title = chapterTitles[cid]
        ch.cache()
        ch.upsert()

        chtml = ch.html()
        if chtml is not None:
            fic.wordCount += len(chtml.split())

    fic.add(Fandom.define('Harry Potter'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Fill `fic` with the metadata parsed from an AO3 work page.

    Reads title, summary, chapter/word/kudos counts, published and status
    dates, the author byline, the chapter dropdown and the fandom and
    relationship tags out of `html`.  When the fic has more than one chapter
    the fic and each of its chapters are upserted.  Recognised fandom tags
    are translated to canonical Fandom names; unrecognised ones are looked
    up in the module-level `ao3FandomsMap` and logged if still unknown.
    Characters are only extracted from relationship tags when the fic ends
    up with exactly one fandom.

    Returns `fic`.

    Raises:
        Exception: when the title heading is not found exactly once, the
            status label is neither 'Completed:' nor 'Updated:', the chapter
            dropdown length disagrees with the chapter count, or a
            relationship tag splits into more than 8 names.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    titleHeadings = soup.findAll('h2', {'class': 'title heading'})
    if len(titleHeadings) != 1:
        raise Exception('unable to find ao3 title {}'.format(fic.url))
    fic.title = titleHeadings[0].get_text().strip()

    # if the page does not have exactly one summary module overall, retry
    # with only the one inside the single 'preface group' div
    summaryModules = soup.findAll('div', {'class': 'summary module'})
    if len(summaryModules) != 1:
        prefaceGroups = soup.findAll('div', {'class': 'preface group'})
        if len(prefaceGroups) == 1:
            summaryModules = prefaceGroups[0].findAll(
                'div', {'class': 'summary module'}
            )

    if len(summaryModules) == 1:
        summaryBq = summaryModules[0].find('blockquote')
        fic.description = summaryBq.decode_contents(formatter='html').strip()
    elif fic.description is None:
        # keep any previously stored description; only fill in a placeholder
        fic.description = "{no summary}"
        # raise Exception('unable to find ao3 summary {}'.format(fic.localId))

    fic.ageRating = '<unkown>'

    # TODO: error handling
    # chapters text is split on '/': completed count, then total or '?'
    cText = ' '.join(soup.find('dd', {'class': 'chapters'}).contents).strip()
    ps = cText.split('/')
    completedChapters = int(ps[0])
    totalChapters = None if ps[1] == '?' else int(ps[1])
    fic.chapterCount = completedChapters

    wText = ' '.join(soup.find('dd', {'class': 'words'}).contents).strip()
    fic.wordCount = int(wText)

    fic.reviewCount = 0

    # kudos are stored as the favorite count; the element may be absent
    fic.favoriteCount = 0
    kDefinition = soup.find('dd', {'class': 'kudos'})
    if kDefinition is not None:
        kText = ' '.join(kDefinition.contents).strip()
        fic.favoriteCount = int(kText)

    fic.followCount = 0

    pText = ' '.join(soup.find('dd', {'class': 'published'}).contents).strip()
    publishedUts = util.parseDateAsUnix(pText, fic.fetched)
    fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    # NOTE(review): this passes the existing fic.updated value (not page
    # text) back through util.parseDateAsUnix -- confirm the helper accepts
    # it; the status block below overwrites fic.updated when a status date
    # is present.
    if fic.updated is not None:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    fic.ficStatus = FicStatus.ongoing  # TODO chapter/chapters?

    # redundant with the default just above; kept as in the original
    if totalChapters is None or completedChapters < totalChapters:
        fic.ficStatus = FicStatus.ongoing

    # the status <dt> label decides complete vs ongoing; the matching <dd>
    # carries the date that becomes fic.updated
    statusDt = soup.find('dt', {'class': 'status'})
    if statusDt is not None:
        if statusDt.contents[0] == 'Completed:':
            fic.ficStatus = FicStatus.complete
            cText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
            updatedUts = util.parseDateAsUnix(cText, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)
        elif statusDt.contents[0] == 'Updated:':
            fic.ficStatus = FicStatus.ongoing
            uText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
            updatedUts = util.parseDateAsUnix(uText, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)
        else:
            raise Exception('unkown status: {}'.format(statusDt.contents[0]))

    # a byline without a link is treated as an anonymous author
    byline = soup.find('h3', {'class': 'byline heading'})
    authorLink = byline.find('a')
    if authorLink is None:
        if fic.authorId is not None and len(fic.getAuthorName()) > 0:
            pass  # updated author to anon, don't make changes
        else:
            # first loaded after it was already set to anonymous
            authorUrl = ''
            author = 'Anonymous'
            authorId = 'Anonymous'
            self.setAuthor(fic, author, authorUrl, authorId)
    else:
        authorUrl = authorLink.get('href')
        author = ' '.join(byline.find('a').contents)
        authorId = author  # map pseudo to real?
        self.setAuthor(fic, author, authorUrl, authorId)

    if fic.chapterCount > 1:
        fic.upsert()
        localChapterIdSelect = soup.find(id='selected_id').findAll('option')
        # note: ao3 sometimes says there are fewer chapters than there really
        # are, possibly due to caching on their end. We just ensure there's
        # _at least_ chapterCount chapters, then fetch whatever the dropdown
        # tells us to
        if len(localChapterIdSelect) > fic.chapterCount:
            fic.chapterCount = len(localChapterIdSelect)
            fic.upsert()
        if len(localChapterIdSelect) != fic.chapterCount:
            raise Exception('mismatching localChapterId count?')

        # dropdown option N (1-based) describes chapter N: its value is the
        # site-local chapter id, its text the chapter title
        for cid in range(1, fic.chapterCount + 1):
            chap = fic.chapter(cid)
            chap.url = '{}{}/chapters/{}?view_adult=true'.format(
                self.baseUrl, fic.localId, localChapterIdSelect[cid - 1].get('value')
            )
            chap.localChapterId = localChapterIdSelect[cid - 1].get('value')
            chap.title = localChapterIdSelect[cid - 1].getText().strip()
            if chap.title is not None:
                chap.title = util.cleanChapterTitle(chap.title, cid)
            chap.upsert()

    # translate each lower-cased fandom tag to zero or more canonical
    # Fandom names; branches are tried in order
    fandomDd = soup.find('dd', {'class': 'fandom tags'})
    if fandomDd is not None:
        fandomTags = fandomDd.findAll('a', {'class': 'tag'})
        for ft in fandomTags:
            originalF = ft.contents[0].strip()
            f = originalF.lower()
            # TODO: this seriously needs reworked
            if (
                (f.startswith("harry potter ") and f.endswith("rowling"))
                or f == 'harry potter - fandom'
                or f == 'fantastic beasts and where to find them (movies)'
                or f == 'harry potter next generation - fandom'
            ):
                fic.add(Fandom.define('Harry Potter'))
            elif (
                f == 'sherlock - fandom'
                or f == 'sherlock (tv)'
                or f == 'sherlock holmes & related fandoms'
                or f == 'sherlock holmes - arthur conan doyle'
                or f == 'sherlock holmes (downey films)'
            ):
                fic.add(Fandom.define('Sherlock Holmes'))
            elif f == 'furry (fandom)' or f == 'harry - fandom':
                continue  # skip
            elif f == 'fleurmione - fandom':
                continue  # skip
            elif f == 'skyfall (2012) - fandom':
                fic.add(Fandom.define('James Bond'))
            elif f == 'orphan black (tv)':
                fic.add(Fandom.define('Orphan Black'))
            elif (
                f == 'naruto'
                or f == 'naruto shippuden'
                or f == 'naruto shippuuden - fandom'
            ):
                fic.add(Fandom.define('Naruto'))
            elif f == 'naruto/harry potter':
                fic.add(Fandom.define('Naruto'))
                fic.add(Fandom.define('Harry Potter'))
            elif f == 'bleach':
                fic.add(Fandom.define('Bleach'))
            elif (
                f == 'iron man (movies)'
                or f == 'iron man - all media types'
                or f == 'iron man (comic)'
                or f == 'iron man - fandom'
                or f == 'iron man (comics)'
            ):
                fic.add(Fandom.define('Iron Man'))
            elif (
                f == 'the avengers (marvel) - all media types'
                or f == 'the avengers (marvel movies)'
                or f == 'the avengers - ambiguous fandom'
                or f == 'the avengers (2012)'
                or f == 'the avengers'
                or f == 'avengers (marvel) - all media types'
                or f == 'marvel avengers movies universe'
                or f == 'avengers'
            ):
                fic.add(Fandom.define('Avengers'))
            elif f == 'marvel 616':
                fic.add(Fandom.define('Marvel'))
                fic.add(Fandom.define('Marvel 616'))
            elif f == 'thor (movies)' or f == 'thor - all media types':
                fic.add(Fandom.define('Thor'))
            elif (
                f == 'captain america (movies)'
                or f == 'captain america - all media types'
                or f == 'captain america (comics)'
            ):
                fic.add(Fandom.define('Captain America'))
            elif (
                f == 'avatar: the last airbender'
                or f == 'avatar: legend of korra'
                or f == 'avatar the last airbender - fandom'
            ):
                fic.add(Fandom.define('Avatar'))
            elif f == 'original work':
                fic.add(Fandom.define('Original Work'))
            elif f == 'stargate atlantis':
                fic.add(Fandom.define('Stargate Atlantis'))
            elif f == 'stargate sg-1':
                fic.add(Fandom.define('Stargate SG-1'))
            elif f == 'stargate - all series':
                fic.add(Fandom.define('Stargate Atlantis'))
                fic.add(Fandom.define('Stargate SG-1'))
            elif f == 'agents of s.h.i.e.l.d. (tv)':
                fic.add(Fandom.define('Avengers'))
            elif f == 'supernatural':
                fic.add(Fandom.define('Supernatural'))
            elif f == 'teen wolf (tv)':
                fic.add(Fandom.define('Teen Wolf'))
            elif f == 'grimm (tv)':
                fic.add(Fandom.define('Grimm'))
            elif (
                f == 'the amazing spider-man (movies - webb)'
                or f == 'spider-man - all media types'
                or f == 'spider-man: homecoming (2017)'
            ):
                fic.add(Fandom.define('Spiderman'))
            elif (
                f == 'x-men - all media types'
                or f == 'x-men (movieverse)'
                or f == 'x-men (comicverse)'
            ):
                fic.add(Fandom.define('X-Men'))
            elif (
                f == 'lord of the rings - j. r. r. tolkien'
                or f == 'the lord of the rings - j. r. r. tolkien'
            ):
                fic.add(Fandom.define('Lord of the Rings'))
            elif (
                f == 'crisis core: final fantasy vii'
                or f == 'compilation of final fantasy vii'
                or f == 'final fantasy vii'
            ):
                fic.add(Fandom.define('Final Fantasy VII'))
                fic.add(Fandom.define('Final Fantasy'))
            elif f == 'sen to chihiro no kamikakushi | spirited away':
                fic.add(Fandom.define('Spirited Away'))
            elif f == 'howl no ugoku shiro | howl\'s moving castle':
                fic.add(Fandom.define('Howl\'s Moving Castle'))
            elif f == 'rise of the guardians (2012)':
                fic.add(Fandom.define('Rise of the Guardians'))
            elif (
                f == 'doctor who'
                or f == 'doctor who (2005)'
                or f == 'doctor who & related fandoms'
            ):
                fic.add(Fandom.define('Doctor Who'))
            elif f == 'daredevil (tv)' or f == 'daredevil (comics)':
                fic.add(Fandom.define('DareDevil'))
            elif f == 'labyrinth (1986)':
                fic.add(Fandom.define('Labyrinth'))
            elif f == 'gravity falls':
                fic.add(Fandom.define('Gravity Falls'))
            elif f == 'once upon a time (tv)':
                fic.add(Fandom.define('Once Upon a Time'))
            elif f == 'doctor strange (comics)':
                fic.add(Fandom.define('Doctor Strange'))
            elif f == 'the sentinel':
                fic.add(Fandom.define('The Sentinel'))
            elif f == 'teen titans (animated series)':
                fic.add(Fandom.define('Teen Titans'))
            elif (
                f == 'dcu'
                or f == 'dcu animated'
                or f == 'dcu (comics)'
                or f == 'dc extended universe'
                or f == 'dc animated universe'
            ):
                fic.add(Fandom.define('DC'))
            elif f == 'vampire hunter d':
                fic.add(Fandom.define('Vampire Hunter D'))
            elif f == 'homestuck':
                fic.add(Fandom.define('Homestuck'))
            elif f == 'one piece':
                fic.add(Fandom.define('One Piece'))
            elif f == 'batman (movies - nolan)':
                fic.add(Fandom.define('Batman'))
            elif f == 'die hard (movies)':
                fic.add(Fandom.define('Die Hard'))
            elif f == 'discworld - terry pratchett':
                fic.add(Fandom.define('Discworld'))
            elif f == 'gossip girl':
                fic.add(Fandom.define('Gossip Girl'))
            elif (
                f == 'a song of ice and fire - george r. r. martin'
                or f == 'a song of ice and fire & related fandoms'
            ):
                fic.add(Fandom.define('A Song of Ice and Fire'))
            elif f == 'supergirl (tv 2015)':
                fic.add(Fandom.define('Supergirl'))
            elif f == 'merlin (tv)':
                fic.add(Fandom.define('Merlin'))
            elif f == 'star trek':
                fic.add(Fandom.define('Star Trek'))
            elif f == 'steven universe (cartoon)':
                fic.add(Fandom.define('Steven Universe'))
            elif f == 'hellsing':
                fic.add(Fandom.define('Hellsing'))
            elif f == 'the breaker':
                fic.add(Fandom.define('The Breaker'))
            elif f == 'smallville':
                fic.add(Fandom.define('Smallville'))
            elif f == '베리타스 | veritas (manhwa)':
                fic.add(Fandom.define('Veritas (manhwa)'))
            elif f == 'guardians of childhood - william joyce':
                fic.add(Fandom.define('Guardians of Childhood'))
            elif f == 'person of interest (tv)':
                fic.add(Fandom.define('Person of Interest'))
            elif f == 'james bond (craig movies)':
                fic.add(Fandom.define('James Bond'))
            elif f == 'the bourne legacy (2012)':
                fic.add(Fandom.define('Jason Bourne'))
            elif f == 'numb3rs':
                fic.add(Fandom.define('Numb3rs'))
            elif f == 'temeraire - naomi novik':
                fic.add(Fandom.define('Temeraire'))
            elif f == 'twilight series - stephenie meyer':
                fic.add(Fandom.define('Twilight'))
            elif f == 'dungeons and dragons - fandom':
                fic.add(Fandom.define('Dungeons and Dragons'))
            elif f == 'american horror story' or f == 'american horror story: cult':
                fic.add(Fandom.define('American Horror Story'))
            elif (
                f == 'worm (web serial novel)'
                or f == 'worm - wildbow'
                or f == 'parahumans series - wildbow'
                or f == 'worm (web serial) | wildbow'
                or f == 'worm - fandom'
                or f == 'parahumans - fandom'
                or f == 'worm (parahumans)'
                or f == 'worm (web serial)'
                or f == 'worm | parahumans'
                or f == 'worm (web novel)'
            ):
                fic.add(Fandom.define('Worm'))
            elif f == 'toaru kagaku no railgun | a certain scientific railgun':
                fic.add(Fandom.define('A Certain Scientific Railgun'))
            elif f == 'toaru majutsu no index | a certain magical index':
                fic.add(Fandom.define('A Certain Magical Index'))
            elif f == 'cthulhu mythos - h. p. lovecraft':
                fic.add(Fandom.define('Cthulhu'))
            elif f == 'transformers - all media types':
                fic.add(Fandom.define('Transformers'))
            elif f == 'destiny (video game)':
                fic.add(Fandom.define('Destiny'))
            elif f == 'fandom - fandom' or f == 'meta - fandom':
                pass  # >_>
            elif f == 'house m.d.':
                fic.add(Fandom.define('House, M.D.'))
            elif f == 'the hobbit (jackson movies)':
                fic.add(Fandom.define('The Hobbit'))
            elif f == 'doctor strange (2016)':
                fic.add(Fandom.define('Doctor Strange'))
            elif f == 'arrow (tv 2012)':
                fic.add(Fandom.define('Arrow'))
            elif f == 'the flash (tv 2014)':
                fic.add(Fandom.define('Flash'))
            elif f == 'senki zesshou symphogear':
                fic.add(Fandom.define('Symphogear'))
            elif (
                f == 'fullmetal alchemist: brotherhood & manga'
                or f == 'fullmetal alchemist - all media types'
                or f == 'fullmetal alchemist (anime 2003)'
            ):
                fic.add(Fandom.define('Fullmetal Alchemist'))
            elif (
                f == 'star wars - all media types'
                or f == 'star wars episode vii: the force awakens (2015)'
                or f == 'star wars prequel trilogy'
            ):
                fic.add(Fandom.define('Star Wars'))
            elif (
                f == 'guardians of the galaxy (2014)'
                or f == 'guardians of the galaxy - all media types'
                or f == 'guardians of the galaxy (movies)'
            ):
                fic.add(Fandom.define('Guardians of the Galaxy'))
            elif f == 'ant man (2015)' or f == 'ant-man (movies)':
                fic.add(Fandom.define('Ant Man'))
            elif f == 'the defenders (marvel tv)':
                fic.add(Fandom.define('The Defenders'))
            elif f == 'elementary (tv)':
                fic.add(Fandom.define('Elementary'))
            elif f == 'good omens - neil gaiman & terry pratchett':
                fic.add(Fandom.define('Good Omens'))
            elif f == 'danny phantom':
                fic.add(Fandom.define('Danny Phantom'))
            elif f == 'katekyou hitman reborn!':
                fic.add(Fandom.define('Katekyo Hitman Reborn!'))
            elif f == 'welcome to night vale':
                fic.add(Fandom.define('Welcome to Night Vale'))
            elif f == 'ncis':
                fic.add(Fandom.define('NCIS'))
            elif f == 'torchwood':
                fic.add(Fandom.define('Torchwood'))
            elif f == 'magic: the gathering':
                fic.add(Fandom.define('Magic: The Gathering'))
            elif f == 'overwatch (video game)':
                fic.add(Fandom.define('Overwatch'))
            elif f == 'detroit: become human (video game)':
                fic.add(Fandom.define('Detroit: Become Human'))
            elif f == 'greek and roman mythology':
                pass
            elif f == 'life is strange (video game)':
                fic.add(Fandom.define('life is strange (video game)'))
            elif f == 'akatsuki no yona | yona of the dawn':
                fic.add(Fandom.define('Yona of the Dawn'))
            elif f == '僕のヒーローアカデミア | boku no hero academia | my hero academia':
                fic.add(Fandom.define('My Hero Academia'))
            elif f == 'voltron: legendary defender':
                fic.add(Fandom.define('Voltron'))
            elif f == 'selfie (tv)':
                fic.add(Fandom.define('Selfie'))
            elif f == 'suits (tv)':
                fic.add(Fandom.define('Suits'))
            elif f == 'fruits basket':
                fic.add(Fandom.define('Fruits Basket'))
            elif f == 'hetalia: axis powers':
                fic.add(Fandom.define('Hetalia: Axis Powers'))
            elif f == 'carmilla (web series)':
                fic.add(Fandom.define('Carmilla'))
            elif f == 'the dresden files - jim butcher':
                fic.add(Fandom.define('Dresden Files'))
            elif f == 'girl genius':
                fic.add(Fandom.define('Girl Genius'))
            elif f == 'unspecified fandom':
                pass  # TODO?
            elif f == 'nightwing (comics)':
                fic.add(Fandom.define('Nightwing'))
            elif f == 'books of the raksura - martha wells':
                fic.add(Fandom.define('Books of the Raksura'))
            elif f == 'fall of ile-rien - martha wells':
                fic.add(Fandom.define('Fall of Ile-Rien'))
            elif f == 'vorkosigan saga - lois mcmaster bujold':
                fic.add(Fandom.define('Vorkosigan Saga'))
            elif (
                f == 'highlander: the series'
                or f == 'highlander - all media types'
            ):
                fic.add(Fandom.define('Highlander'))
            elif f == 'yoroiden samurai troopers | ronin warriors':
                fic.add(Fandom.define('Ronin Warriors'))
            elif f == 'hockey rpf':
                fic.add(Fandom.define('Hockey RPF'))
            elif f == 'pacific rim (2013)':
                fic.add(Fandom.define('Pacific Rim'))
            elif f == 'enchanted forest chronicles - patricia wrede':
                fic.add(Fandom.define('Enchanted Forest Chronicles'))
            elif f == 'tortall - tamora pierce':
                fic.add(Fandom.define('Tortall'))
            elif f == 'protector of the small - tamora pierce':
                fic.add(Fandom.define('Protector of the Small'))
            elif f == 'leverage':
                fic.add(Fandom.define('Leverage'))
            elif f == 'valdemar series - mercedes lackey':
                fic.add(Fandom.define('Valdemar Series'))
            elif (
                f == 'b.p.r.d.'
                or f == 'bureau for paranormal research and defense'
            ):
                fic.add(Fandom.define('B.P.R.D.'))
            elif f == 'hellboy (comic)':
                fic.add(Fandom.define('Hellboy'))
            elif f == 'sga/avatar':
                fic.add(Fandom.define('Stargate Atlantis'))
                fic.add(Fandom.define('Avatar'))
            elif f == 'annihilation (2018 garland)':
                fic.add(Fandom.define('Annihilation'))
            elif f == 'craft sequence - max gladstone':
                fic.add(Fandom.define('Craft Sequence'))
            elif f == 'the good place (tv)':
                fic.add(Fandom.define('The Good Place'))
            elif f == 'jessica jones (tv)':
                fic.add(Fandom.define('Jessica Jones'))
            elif f == 'mad max series (movies)':
                fic.add(Fandom.define('Mad Max'))
            elif f == 'american gods (tv)':
                fic.add(Fandom.define('American Gods'))
            elif f == 'terminator: the sarah connor chronicles':
                fic.add(Fandom.define('Terminator: The Sarah Connor Chronicles'))
                fic.add(Fandom.define('Terminator'))
            elif f == 'wolf 359 (radio)':
                fic.add(Fandom.define('Wolf 359'))
            elif f == 'shadowrun: dragonfall':
                fic.add(Fandom.define('Shadowrun'))
            elif f == 'ars paradoxica (podcast)':
                fic.add(Fandom.define('Ars Paradoxica'))
            elif f == 'love is strange - fandom':
                fic.add(Fandom.define('Love is Strange'))
            elif f == 'dune - all media types':
                fic.add(Fandom.define('Dune'))
            elif f == 'dragon age: origins':
                fic.add(Fandom.define('Dragon Age: Origins'))
            elif f == 'game of thrones (tv)':
                fic.add(Fandom.define('Game of Thrones'))
            elif f == 'chronicles of amber - roger zelazny':
                fic.add(Fandom.define('Chronicles of Amber'))
            elif f == 'the southern reach trilogy - jeff vandermeer':
                fic.add(Fandom.define('The Southern Reach Trilogy'))
            elif f == 'continuum (tv)':
                fic.add(Fandom.define('Continuum'))
            elif f == 'mage: the ascension':
                fic.add(Fandom.define('Mage: The Ascension'))
            elif f == 'the good wife (tv)' or f == 'good wife (tv)':
                fic.add(Fandom.define('The Good Wife'))
            elif f == 'alliance-union - c. j. cherryh':
                fic.add(Fandom.define('Alliance-Union'))
            elif f == 'indexing - seanan mcguire':
                fic.add(Fandom.define('Indexing'))
            elif f == 'ultraviolet (tv)':
                fic.add(Fandom.define('Ultraviolet'))
            elif f == 'veronica mars (tv)':
                fic.add(Fandom.define('Veronica Mars'))
            elif f == 'secret circle (tv)':
                fic.add(Fandom.define('Secret Circle'))
            elif f == 'mahou shoujo madoka magika | puella magi madoka magica':
                fic.add(Fandom.define('Madoka Magica'))
            elif f == 'agent carter (tv)':
                fic.add(Fandom.define('Agent Carter'))
            elif f == 'dracula & related fandoms':
                fic.add(Fandom.define('Dracula'))
            elif f == 'dragon ball':
                fic.add(Fandom.define('Dragon Ball'))
            elif f == 'mass effect - all media types':
                fic.add(Fandom.define('Mass Effect'))
            elif f == 'firefly' or f == 'serenity (2005)':
                fic.add(Fandom.define('Firefly'))
            else:
                # not hard-coded above: consult ao3FandomsMap, whose entries
                # are indexed as fm[0] (names to match, compared lower-cased
                # and stripped) and fm[1] (fandom names to define)
                anyHere = False
                global ao3FandomsMap
                for fm in ao3FandomsMap:
                    here = False
                    for uf in fm[0]:
                        if f == uf.lower().strip():
                            here = True
                            break
                    if not here:
                        continue
                    anyHere = True
                    for mf in fm[1]:
                        fic.add(Fandom.define(mf))
                if not anyHere:
                    # unknown fandoms are logged rather than treated as fatal
                    util.logMessage(f'ao3|unknown fandom|{fic.url}|{originalF}')
                    #raise Exception('unknown fandom: {} "{}"'.format(fic.url, originalF))

    ourDoms = fic.fandoms()
    # we have a canonical fandom, try to find our characters
    if len(ourDoms) == 1:
        relationshipDd = soup.find('dd', {'class': 'relationship tags'})
        if relationshipDd is not None:
            relationshipTags = relationshipDd.findAll('a', {'class': 'tag'})
            for rt in relationshipTags:
                # each relationship tag is split on '/' into character names
                r = rt.contents[0]
                chars = r.split('/')
                if len(chars) > 8:  # TODO: sometimes more?
                    raise Exception('unable to parse relationship: {}'.format(r))
                for char in chars:
                    fic.add(Character.defineInFandom(ourDoms[0], char, self.ftype))

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Currently disabled: always raises before doing any parsing.

    The first statement raises unconditionally, so everything below it is
    unreachable and is kept only as a record of how the page used to be
    parsed: header rows of the 'FicHead' div keyed by their Russian labels,
    the summary div, and a per-chapter loop that takes chapter HTML/titles
    from the page blob and falls back to scraping the chapter URL.

    Raises:
        Exception: always ('FIXME TODO fanfics me format has changed').
    """
    raise Exception('FIXME TODO fanfics me format has changed')
    # ---- unreachable from here on; see docstring ----
    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    ficHead = soup.find('div', {'class': 'FicHead'})

    titleH1 = ficHead.find('h1')
    fic.title = titleH1.getText().strip()

    fandoms: List[str] = []
    trs = ficHead.findAll('div', {'class': 'tr'})
    author = None
    for tr in trs:
        # each row is a label ('title' div) plus its value ('content' div)
        divTitle = tr.find('div', {'class': 'title'})
        divContent = tr.find('div', {'class': 'content'})

        t = str(divTitle.getText()).strip()
        v = str(divContent.getText()).strip()

        if t == 'Автор:':
            author = v
        elif t == 'Фандом:':
            if v == 'Harry Potter' or v == 'Harry Potter - J. K. Rowling':
                fandoms += ['Harry Potter']
            else:
                raise Exception('unknown fandom: ' + v)
        elif t == 'Статус:':
            if v == 'В процессе':
                fic.ficStatus = FicStatus.ongoing
            elif v == 'Закончен':
                fic.ficStatus = FicStatus.complete
            else:
                raise Exception('unknown write status: ' + v)
        elif t == 'Опубликован:':
            fic.published = self.parseRussianDate(v)
        elif t == 'Изменен:':
            fic.updated = self.parseRussianDate(v)
        elif t == 'Ссылка:':
            src = v  # source archive url (assigned but not used below)
        elif t == 'Читателей:':
            fic.followCount = int(v)
        elif t == 'Персонажи:':
            # characters, parse relationship?
            pass
        elif t == 'Рейтинг:':
            fic.ageRating = v
        elif t == 'Предупреждения:':
            # warnings?
            pass
        else:
            raise Exception('unknown metadata: ' + t)  # TODO?

    # the author name doubles as both author url and author id
    assert (author is not None)
    authorUrl = author
    authorId = author
    self.setAuthor(fic, author, authorUrl, authorId)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    if fic.url is None:
        fic.url = self.constructUrl(fic.localId)

    # two class names are tried for the summary container
    summaryTextDiv = soup.find('div', {'class': 'summary_text'})
    if summaryTextDiv is None:
        summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
    fic.description = summaryTextDiv.getText()

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    if fic.followCount is None:
        fic.followCount = 0

    # NOTE(review): this overwrites any rating read from the header rows
    # above -- confirm that is intended.
    fic.ageRating = 'M'

    ficContentsUl = soup.find('ul', {'class': 'FicContents'})
    chapterLinks = ficContentsUl.findAll('li', {'class': 't-b-dotted'})
    fic.chapterCount = len(chapterLinks)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapter = fic.chapter(cid)
        chapter.localChapterId = str(cid)
        chapter.url = self.constructUrl(fic.localId, cid)

        # try to get it out of current blob first
        # (chapter N is looked up under the div id 'c<N-1>')
        if chapter.html() is None:
            contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
            if contentDiv is not None:
                chapter.setHtml(
                    '<div class="ReadContent">' + str(contentDiv) + '</div>'
                )
        if chapter.title is None or len(chapter.title) < 1:
            contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
            if contentDiv is not None:
                # the title is taken from an <h2> directly preceding the div
                chapterTitle = contentDiv.previous_sibling
                if chapterTitle is not None and chapterTitle.name == 'h2':
                    chapter.title = chapterTitle.getText()

        # fallback to scraping it directly
        if chapter.html() is None:
            cdata = scrape.softScrape(chapter.url)
            assert (cdata is not None)
            chapter.setHtml(self.extractContent(fic, cdata))
            csoup = BeautifulSoup(cdata, 'html5lib')
            contentDiv = csoup.find('div', {'id': 'c{}'.format(cid - 1)})
            chapterTitle = contentDiv.previous_sibling
            if chapterTitle is not None and chapterTitle.name == 'h2':
                chapter.title = chapterTitle.getText()

        if chapter.title is not None and len(chapter.title) > 0:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)

        chapter.upsert()
        wordCount += len(chapter.cachedContent().split())

    fic.wordCount = wordCount

    for fandom in fandoms:
        fic.add(Fandom.define(fandom))

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate ``fic`` with the metadata found in a story info page.

    Extracts title, author, summary, age rating, chapter count, completion
    status, published/updated timestamps and word count from ``html`` and
    tags the fic with the 'Harry Potter' fandom.

    Args:
        fic: the fic record to fill in; mutated in place.
        html: raw html of the story's info page.

    Returns:
        The same ``fic`` object.

    Raises:
        Exception: if the page does not contain exactly three
            ``width="95%"`` tables, or if the author link or the summary
            cannot be located.
    """
    from bs4 import BeautifulSoup
    html = html.replace('\r\n', '\n')
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    w95tables = soup.findAll('table', {'width': '95%'})
    if len(w95tables) != 3:
        raise Exception('wrong number of w95 tables: {}'.format(
            len(w95tables)))

    # the first of the three tables holds the title
    ficInfoTable = w95tables[0]
    ficTitleH3 = ficInfoTable.find('h3')
    fic.title = ficTitleH3.get_text().strip()

    authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
    if authorUrlMatch is None:
        raise Exception('could not locate author url')

    author = authorUrlMatch.group(2)
    authorId = authorUrlMatch.group(1)
    authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId
    self.setAuthor(fic, author, authorUrl, authorId)

    # TODO: this may miss multiline summaries :(
    summaryMatch = re.search(
        '<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE)
    if summaryMatch is None:
        edumpContent(html, 'siye_summary')
        raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

    fic.description = summaryMatch.group(1).strip()

    fic.ageRating = '<unkown>'
    ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
    if ageRatingMatch is not None:
        fic.ageRating = ageRatingMatch.group(1).strip()

    # The chapter count is the highest chapter number linked from the page.
    # Single chapter stories link to "...&chapter=Array" instead of a number.
    maxChapter = 0
    baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
    singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
        fic.localId)
    isSingleChapterFic = False
    for a in soup.find_all('a'):
        href = a.get('href')
        if href is None:
            continue
        if not href.startswith(baseChapterHref):
            continue
        if href.startswith(singleChapterHref):
            isSingleChapterFic = True
            maxChapter = max(1, maxChapter)
            continue
        cid = int(href[len(baseChapterHref):])
        maxChapter = max(cid, maxChapter)

    fic.chapterCount = maxChapter

    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    # str.find() returns -1 (which is truthy) when the marker is absent, so
    # test membership instead of the truthiness of the returned index.
    fic.ficStatus = FicStatus.ongoing
    if 'Story is Complete' in html:
        fic.ficStatus = FicStatus.complete

    # published/updated are the earliest/latest "updated on" dates on the page
    updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
    minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
    maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
    for (year, month, day) in re.findall(updatedOnPattern, html):
        date = '{}/{}/{}'.format(year, month, day)
        dt = util.parseDateAsUnix(date, fic.fetched)
        minUpdate = min(minUpdate, dt)
        maxUpdate = max(maxUpdate, dt)

    if fic.published is None or fic.published.toUTS() > minUpdate:
        fic.published = OilTimestamp(minUpdate)
    if fic.updated is None or fic.updated.toUTS() < maxUpdate:
        fic.updated = OilTimestamp(maxUpdate)
    if fic.updated < fic.published:
        fic.updated = fic.published

    fic.wordCount = 0
    wordsPattern = re.compile(r'(\d+) words')
    for words in re.findall(wordsPattern, html):
        fic.wordCount += int(words)

    # Single chapter stories may list no per-chapter word counts; fall back
    # to counting the words of the first chapter. This is best effort only.
    if fic.wordCount == 0 and isSingleChapterFic:
        try:
            fic.upsert()
            ch1 = fic.chapter(1)
            ch1.cache()
            chtml = ch1.html()
            if chtml is not None:
                fic.wordCount = len(chtml.split())
        except Exception as e:
            # keep the best-effort semantics, but do not swallow
            # KeyboardInterrupt/SystemExit and leave a trace of the failure
            util.logMessage(
                'unable to count words of single chapter fic: {}'.format(e))

    fic.add(Fandom.define('Harry Potter'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from an adult-fanfiction.org story page.

    Reads title, author and the chapter list from ``wwwHtml``, scrapes every
    chapter to store its html and compute the word count, then looks the
    story up through the archive's search page to obtain dates, review
    count, status, description, tags and fandoms.

    ``fic.localId`` is expected to have the form ``<archive>/<storyNo>``
    (it is split on '/').

    Args:
        fic: the fic record to fill in; mutated in place and upserted.
        wwwHtml: raw html of the story page.

    Returns:
        The same ``fic`` object.

    Raises:
        Exception: if the story is not present in the search results.
        AssertionError: if the search metadata lacks dates or a description.
    """
    from bs4 import BeautifulSoup
    from urllib.parse import quote_plus
    archive = fic.localId.split('/')[0]
    storyNo = fic.localId.split('/')[1]

    soup = BeautifulSoup(wwwHtml, 'html5lib')

    # the title is the text of the link pointing back at the story itself
    titleH2 = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
    fic.title = str(titleH2.getText())

    membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
    memberLink = soup.find(
        lambda t: (
            t.name == 'a' and t.has_attr("href")
            and t.get("href") is not None
            and (t.get("href").startswith(membersUrl))
        )
    )

    author = memberLink.getText()
    authorId = memberLink.get('href')[len(membersUrl):]
    authorUrl = memberLink.get('href')
    self.setAuthor(fic, author, authorUrl, authorId)

    # TODO
    fic.ficStatus = FicStatus.ongoing

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    fic.url = self.constructUrl(fic.localId, 1)

    # TODO: description is on search page
    if fic.description is None:
        fic.description = 'TODO: on the search page?'

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ageRating = 'M'

    # TODO
    if fic.published is None:
        fic.published = OilTimestamp.now()
    if fic.updated is None:
        fic.updated = fic.published

    chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
    chapterLinks = chapterDropdown.findAll('a')
    oldChapterCount = fic.chapterCount
    fic.chapterCount = len(chapterLinks)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapterUrl = self.constructUrl(fic.localId, cid)
        chapterContent = scrape.softScrape(chapterUrl)
        chapter = fic.chapter(cid)
        if chapterContent is not None:
            chapter.setHtml(chapterContent)
        chapter.localChapterId = str(cid)
        chapter.url = self.constructUrl(fic.localId, cid)

        chapter.title = chapterLinks[cid - 1].getText().strip()
        if chapter.title is not None:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)

        chapter.upsert()
        if chapterContent is not None:
            wordCount += len(chapterContent.split())

    fic.wordCount = wordCount

    # a grown chapter count means the story was updated since the last visit
    if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
        fic.updated = OilTimestamp.now()  # TODO
    fic.upsert()

    storyUrl = self.constructUrl(fic.localId, chapterId=None)

    # more metadata from search page
    # Author and title are user-supplied text: url-encode them so that
    # characters such as '&', '#', '+' or non-ascii letters cannot corrupt
    # the query string. Spaces still become '+', as before.
    searchUrl = ('http://{}.adult-fanfiction.org/search.php?'
                 + 'auth={}&title={}&summary=&tags=&cats=0&search=Search')
    searchUrl = searchUrl.format(
        archive, quote_plus(author), quote_plus(fic.title))
    data = scrape.scrape(searchUrl)['raw']

    metas = self.extractSearchMetadata(data)

    # fallback to pure author search
    if storyUrl not in metas:
        searchUrl = ('http://{}.adult-fanfiction.org/search.php?'
                     + 'auth={}&title=&summary=&tags=&cats=0&search=Search')
        searchUrl = searchUrl.format(archive, quote_plus(author))
        data = scrape.scrape(searchUrl)['raw']
        metas = self.extractSearchMetadata(data)

    if storyUrl not in metas:
        raise Exception('cannot find search metadata')

    meta = metas[storyUrl]

    assert (meta.published is not None and meta.updated is not None)
    fic.published = OilTimestamp(meta.published)
    fic.updated = OilTimestamp(meta.updated)

    fic.reviewCount = meta.reviewCount
    fic.favoriteCount = meta.views  # TODO

    fic.ficStatus = meta.ficStatus

    assert (meta.description is not None)
    fic.description = meta.description
    assert (fic.description is not None)
    if len(meta.tags) > 0:
        fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

    for fan in meta.fandoms:
        fic.add(Fandom.define(fan))

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate ``fic`` with the metadata found in a story page.

    Title and author come from the links inside the ``#pagetitle`` element,
    the description from the text between the SUMMARY START/END html
    comments, and chapter count, word count, dates and completion status
    from the 'Story Information' block. Stories whose info block does not
    mention 'Crossovers' are tagged with the 'Harry Potter' fandom.

    Args:
        fic: the fic record to fill in; mutated in place.
        html: raw html of the story page.

    Returns:
        The same ``fic`` object.

    Raises:
        Exception: if the title, the author or the 'Story Information'
            block cannot be found, or if the 'Completed:' value is neither
            'Yes' nor 'No'.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    pagetitle = soup.find(id='pagetitle')
    aTags = pagetitle.findAll('a')
    author = None
    for a in aTags:
        href = a.get('href')
        if href is None:
            # anchors without an href (e.g. named anchors) carry no link
            continue
        if href.startswith('viewstory'):
            fic.title = a.contents[0].strip()
        elif href.startswith('viewuser.php?uid='):
            author = a.contents[0]
            authorUrl = self.baseUrl + href
            authorId = str(int(href[len('viewuser.php?uid='):]))
            self.setAuthor(fic, author, authorUrl, authorId)

    if fic.title is None:
        raise Exception('unable to find title')
    if author is None:
        raise Exception('unable to find author')

    # Break the html so that every tag starts its own line, then collect the
    # lines from the SUMMARY START comment up to (excluding) SUMMARY END.
    lines = html.replace('\r', '\n').replace('<', '\n<').split('\n')
    inDescription = False
    descriptionParts = []
    for line in lines:
        cur = line.strip()
        if '!-- SUMMARY START --' in cur:
            inDescription = True
        elif '!-- SUMMARY END --' in cur:
            inDescription = False

        if inDescription:
            descriptionParts.append(cur + '\n')

    fic.description = ''.join(descriptionParts)

    fic.ageRating = '<unkown>'

    infoBlock = None
    infoText = None
    blocks = soup.findAll('div', {'class': 'block'})
    for block in blocks:
        title = block.find('div', {'class': 'title'})
        if title is None:
            continue
        if title.contents[0] != 'Story Information':
            continue
        infoBlock = block
        infoText = block.get_text()
        break
    else:
        raise Exception('unable to find info text')

    matcher = RegexMatcher(
        infoText, {
            'chapterCount': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Word count:\s+(\S+)', int),
        })
    matcher.matchAll(fic)

    sortDiv = soup.find(id='sort')
    match = re.search(r'Reviews\s*-\s*([^\]]+)', sortDiv.get_text())
    if match is not None:
        fic.reviewCount = int(match.group(1).replace(',', ''))
    else:
        fic.reviewCount = 0

    fic.favoriteCount = 0
    fic.followCount = 0

    # the dates are wrapped in html comments, so search the block's markup
    # rather than its text
    infoBlockHtml = str(infoBlock)
    match = re.search(
        '<!-- PUBLISHED START -->([^<]*)<!-- PUBLISHED END -->',
        infoBlockHtml)
    if match is not None:
        publishedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    match = re.search('<!-- UPDATED START -->([^<]*)<!-- UPDATED END -->',
                      infoBlockHtml)
    if match is not None:
        updatedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.updated is None:
        fic.updated = fic.published

    match = re.search(r'Completed:\s+(\S+)', infoText)
    if match is not None:
        complete = match.group(1)
        if complete == 'No':
            fic.ficStatus = FicStatus.ongoing
        elif complete == 'Yes':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception('unknown complete value: {}'.format(complete))

    match = re.search('Crossovers', infoText)
    if match is not None:
        # crossovers are currently accepted without assigning any fandom
        pass
    else:
        # otherwise not a crossover and just harry potter
        fic.add(Fandom.define('Harry Potter'))

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` with the metadata found in a story page.

    Reads the title from the story link, the description from the
    ``storysummary`` table and rating, counts, dates and status from the
    text of the ``storymaininfo`` table; the author comes from the first
    ``viewuser.php?showuid=`` link. The fic is tagged with the
    'Harry Potter' fandom.

    Args:
        fic: the fic record to fill in; mutated in place.
        wwwHtml: raw html of the story page.

    Returns:
        The same ``fic`` object.

    Raises:
        Exception: if the main info table, the summary table, the title,
            the status or the author cannot be found.
    """
    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(wwwHtml, 'html.parser')

    storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
    if len(storyMainInfo) != 1:
        raise Exception('unable to find main story info')
    storyMainInfo = storyMainInfo[0]

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    # The story link is either a plain '?psid=<id>' href or, for mature
    # stories, a javascript confirmation prompt starting with this text.
    disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
    storyHref = '?psid={}'.format(fic.localId)
    for a in soup.findAll('a'):
        href = a.get('href')
        if href is None:
            # anchors without an href (e.g. named anchors) cannot be the link
            continue
        if not href.startswith(disclaimerJs) and href != storyHref:
            continue
        fic.title = a.getText()
        break
    else:
        raise Exception('error: unable to find title')

    fic.url = self.constructUrl(fic.localId)

    storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
    if len(storySummaryTable) != 1:
        raise Exception('cannot find story summary table')
    storySummaryTable = storySummaryTable[0]
    fic.description = (storySummaryTable.getText().strip())
    if fic.description is None:
        raise Exception('error: unable to find description')

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    # normalise non-breaking spaces so the \s based patterns below match
    text = storyMainInfo.getText().replace('\xa0', ' ')
    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rating:\s+(Mature|15\+|12\+)', str),
            'chapterCount': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\d+)', int),
            'reviewCount': (r'Story Reviews:\s*(\d+)', int),
            'favoriteCount': (r'Favorite Story Of:\s+(\d+) users', int),
            'updated': (r'Last Updated:\s+(\S+)', str),
            'published': (r'First Published:\s+(\S+)', str),
        })
    matcher.matchAll(fic)

    # the matcher stores the dates as strings; convert them to timestamps
    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    match = re.search(r'Status:\s+(Completed|Work In Progress|Abandoned)',
                      text)
    if match is None:
        raise Exception('cannot find write status')

    status = match.group(1)
    if status == 'Completed':
        fic.ficStatus = FicStatus.complete
    elif status == 'Work In Progress':
        fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
    elif status == 'Abandoned':
        fic.ficStatus = FicStatus.abandoned
    else:
        raise Exception('unknown status: {}'.format(status))

    for a in soup.findAll('a'):
        a_href = a.get('href')
        if a_href is None:
            continue
        if a_href.startswith('viewuser.php?showuid='):
            author = a.get_text()
            authorUrl = self.baseUrl + '/' + a_href
            authorId = a_href[len('viewuser.php?showuid='):]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    # TODO: chars/pairings?
    fic.add(Fandom.define('Harry Potter'))
    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from a story page and create its chapter records.

    Title, description, rating, counts, dates, status and author are read
    from the ``#profile_top`` element; genres are taken from the links in
    ``#pre_story_links`` and chapter titles from the ``#chap_select``
    dropdown. The fic and every chapter are upserted.

    If ``#profile_top`` is missing and the page shows the 'Story Not Found'
    warning, the fic is marked abandoned, upserted and returned immediately.

    Args:
        fic: the fic record to fill in; mutated in place and upserted.
        wwwHtml: raw html of the story page.

    Returns:
        The same ``fic`` object.

    Raises:
        Exception: if the profile block, title, description or author
            cannot be found, or on an unknown status or category.
    """
    from bs4 import BeautifulSoup  # type: ignore
    deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
    soup = BeautifulSoup(wwwHtml, 'html5lib')
    profile_top = soup.find(id='profile_top')
    # story might've been deleted
    if profile_top is None:
        gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
        for gui_warning in gui_warnings:
            if gui_warning.get_text() == deletedFicText:
                fic.ficStatus = FicStatus.abandoned
                fic.upsert()
                return fic
        # not a recognised deletion notice: fail with a clear message rather
        # than an AttributeError on None further down
        raise Exception('error: unable to find profile_top')

    text = profile_top.get_text()
    pt_str = str(profile_top)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    for b in profile_top.find_all('b'):
        # tags without a class attribute yield None; treat that as no classes
        b_class = b.get('class') or []
        if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
            fic.title = b.get_text()
            break
    else:
        raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

    fic.url = self.constructUrl(fic.localId, 1, fic.title)

    for div in profile_top.find_all('div'):
        div_class = div.get('class') or []
        if (
            div.get('style') == 'margin-top:2px' and len(div_class) == 1
            and div_class[0] == 'xcontrast_txt'
        ):
            fic.description = div.get_text()
            break
    else:
        raise Exception(
            'error: unable to find description:\n{}\n'.format(pt_str))

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
            'chapterCount?': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\S+)', int),
            'reviewCount?': (r'Reviews:\s+(\S+)', int),
            'favoriteCount?': (r'Favs:\s+(\S+)', int),
            'followCount?': (r'Follows:\s+(\S+)', int),
            'updated?': (r'Updated:\s+(\S+)', str),
            'published': (r'Published:\s+(\S+)', str),
        }
    )
    matcher.matchAll(fic)

    # the matcher stores the dates as strings; convert them to timestamps
    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    # a missing 'Status:' field is treated as an ongoing story
    match = re.search(r'Status:\s+(\S+)', text)
    if match is None:
        fic.ficStatus = FicStatus.ongoing
    else:
        status = match.group(1)
        if status == 'Complete':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception('unknown status: {}'.format(status))

    for a in profile_top.find_all('a'):
        a_href = a.get('href')
        if a_href is None:
            continue
        if a_href.startswith('/u/'):
            author = a.get_text()
            authorUrl = self.baseUrl + a_href
            authorId = a_href.split('/')[2]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    preStoryLinks = soup.find(id='pre_story_links')
    preStoryLinksLinks = preStoryLinks.find_all('a')
    for a in preStoryLinksLinks:
        href = a.get('href')
        if href is None:
            continue
        hrefParts = href.split('/')

        # if it's a top level category, i.e. '/<category>/'
        if (
            len(hrefParts) == 3 and len(hrefParts[0]) == 0
            and len(hrefParts[2]) == 0
        ):
            cat = hrefParts[1]
            if cat in fictionPressCategories:
                continue  # skip categories
            raise Exception('unknown category: {}'.format(cat))

        # if it's a regular genre in some category, i.e. '/<category>/<genre>/'
        if (
            len(hrefParts) == 4 and len(hrefParts[0]) == 0
            and len(hrefParts[3]) == 0
        ):
            # ensure category is in our map
            if hrefParts[1] not in fictionPressCategories:
                raise Exception('unknown category: {}'.format(hrefParts[1]))

            # ensure it's in our whitelist
            if hrefParts[2] not in fictionPressGenres:
                util.logMessage(
                    f'FictionPressAdapter: unknown genre {hrefParts[2]}')
                continue

            fic.add(Fandom.define(hrefParts[2]))
            continue

        util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')

    fic.upsert()

    # chapter titles are only looked up for multi-chapter stories
    chapterTitles = []
    if fic.chapterCount > 1:
        chapterSelect = soup.find(id='chap_select')
        chapterOptions = []
        if chapterSelect is not None:
            chapterOptions = chapterSelect.findAll('option')
        chapterTitles = [co.getText().strip() for co in chapterOptions]

    for cid in range(fic.chapterCount):
        ch = fic.chapter(cid + 1)
        ch.localChapterId = str(cid + 1)
        if len(chapterTitles) > cid:
            ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
        elif fic.chapterCount == 1 and cid == 0:
            # a single chapter story reuses the story title
            ch.title = fic.title
        ch.upsert()

    return fic