def v0_fic_all(urlId: str) -> Any:
    """Return a fic's info together with the cleaned content of its chapters.

    The fic is looked up by ``urlId`` and chapter ids ``1..chapterCount`` are
    walked in order. Stored chapter content is decompressed, decoded and run
    through ``cleanHtml`` before being placed in the response.

    Returns:
        The ``Err.urlId_not_found`` response when the lookup does not yield
        exactly one fic or the fic has no chapter count, the
        ``Err.cid_not_found`` response when a chapter row is missing, and
        otherwise ``Err.ok`` wrapping ``{'info': ..., 'chapters': ...}``.
    """
    fics = Fic.select({'urlId': urlId})
    if len(fics) != 1:
        return Err.urlId_not_found.get()

    fic = fics[0]
    if fic.chapterCount is None:
        print(f'err: fic has no chapter count: {fic.id}')
        return Err.urlId_not_found.get()

    ficChapters = {
        fc.chapterId: fc
        for fc in FicChapter.select({'ficId': fic.id})
    }

    chapters = {}
    for cid in range(1, fic.chapterCount + 1):
        if cid not in ficChapters:
            return Err.cid_not_found.get({'arg': f'{fic.id}/{cid}'})

        cres = ficChapters[cid].toJSONable()
        try:
            content = cres['content']
            if content is not None:
                content = util.decompress(content)
                content = scrape.decodeRequest(content, f'{fic.id}/{cid}')
                content = cleanHtml(content)
                # cleaning already-clean html should be a no-op; report when
                # it is not so the cleaner can be investigated
                if content != cleanHtml(content):
                    print(
                        f'v0_fic_all: {fic.id}/{cid} did not round-trip through cleanHtml'
                    )
            cres['content'] = content
            chapters[cid] = cres
        except Exception as e:
            # Best-effort: a chapter that cannot be decoded is left out of the
            # response, but the failure is reported instead of vanishing.
            print(f'v0_fic_all: {fic.id}/{cid} failed to process content: {e}')

    res = fic.toJSONable()
    return Err.ok({'info': res, 'chapters': chapters})
def get(self, localId: str) -> Fic:
    """Return the fic for ``localId`` on this adapter's source, creating it if needed.

    An existing fic is reused only when exactly one row matches; in every
    other case a new fic is initialised and handed to ``self.create``.
    """
    matches = Fic.select({'sourceId': self.ftype, 'localId': localId})
    if len(matches) != 1:
        fresh = Fic.new()
        fresh.sourceId = self.ftype
        fresh.localId = localId
        fresh.created = OilTimestamp.now()
        return self.create(fresh)

    return matches[0]
def getFromZList(self, localId: int, ts: int, html: str) -> Fic:
    """Look up or initialise the fic for ``localId`` and pass it to ``createFromZList``.

    The stored local id is the string form of ``localId``. An existing fic is
    reused only when exactly one row matches; otherwise a new one is prepared.
    Either way the fic is forwarded with ``ts`` and ``html`` unchanged.
    """
    found = Fic.select({'sourceId': self.ftype, 'localId': str(localId)})
    if len(found) == 1:
        fic = found[0]
    else:
        fic = Fic.new()
        fic.sourceId = self.ftype
        fic.localId = str(localId)
        fic.created = OilTimestamp.now()

    return self.createFromZList(fic, ts, html)
def get(self, localId: str) -> Fic:
    """Return the fic for ``localId``, creating it when the adapter is cacheable.

    Raises:
        Exception: when no single matching fic exists and ``self.cacheable``
            is falsy.
    """
    query = {'sourceId': self.ftype, 'localId': localId}
    matches = Fic.select(query)
    if len(matches) == 1:
        return matches[0]

    # nothing usable is stored; only cacheable adapters may create one
    if not self.cacheable:
        raise Exception('cannot cache {}/{}'.format(localId, self.ftype))

    created = Fic.new()
    created.sourceId = self.ftype
    created.localId = localId
    created.created = OilTimestamp.now()
    return self.create(created)
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Resolve ``url`` to a ``FicId`` by matching it against stored urls.

    A unique chapter url match (whose fic can be loaded) yields a
    chapter-specific id; failing that, a unique fic url match yields a fic id.

    Raises:
        NotImplementedError: when neither lookup produces a result.
    """
    # by default, we simply try to look up the url in existing chapters or fics
    chapterHits = FicChapter.select({'url': url})
    if len(chapterHits) == 1:
        chapter = chapterHits[0]
        owner = Fic.get((chapter.ficId, ))
        if owner is not None:
            return FicId(
                FicType(owner.sourceId), owner.localId, chapter.chapterId, False
            )

    ficHits = Fic.select({'url': url})
    if len(ficHits) != 1:
        raise NotImplementedError()

    hit = ficHits[0]
    return FicId(FicType(hit.sourceId), hit.localId)
def v0_cache(urlId: str) -> Any:
    """Cache every chapter of the fic identified by ``urlId``.

    Chapters ``1..chapterCount`` are cached in order; the first failure stops
    the walk.

    Returns:
        The ``Err.urlId_not_found`` response when the lookup does not yield
        exactly one fic or the fic has no chapter count, the
        ``Err.failed_to_cache_cid`` response for the first chapter that fails,
        and otherwise ``Err.ok`` wrapping the fic.
    """
    fics = Fic.select({'urlId': urlId})
    if len(fics) != 1:
        return Err.urlId_not_found.get()

    fic = fics[0]
    if fic.chapterCount is None:
        print(f'err: fic has no chapter count: {fic.id}')
        return Err.urlId_not_found.get()

    for cid in range(1, fic.chapterCount + 1):
        try:
            fic.chapter(cid).cache()
        except Exception as e:
            # the response only names the chapter; record the cause here so
            # the failure can actually be diagnosed
            print(f'v0_cache: failed to cache {fic.id}/{cid}: {e}')
            return Err.failed_to_cache_cid.get({'arg': f'{fic.id}/{cid}'})

    return Err.ok(fic.toJSONable())
def populateManualTemplate(url, chapterUrls, author):
    """Build a fill-in-the-blanks dict describing a manually added fic.

    The local id is one more than the number of fics currently selected with
    the manual type. Placeholder strings mark the fields a human is expected
    to edit. One chapter entry is produced per element of ``chapterUrls``,
    keyed by 1-based chapter id.
    """
    manualCount = len(Fic.select({'type': FicType.manual}))
    lid = manualCount + 1

    renames = {'id': None}
    defaults = {
        'fandoms': [],
        'characters': [],
        'tags': [],
        'genres': [],
        'authorUrl': url,
        'author': author,
        'authorId': author,
        'ageRating': 'M',
        'language': 'English',
        'favorites': 0,
        'follows': 0,
        'reviews': 0,
        'url': url,
        'lastUrl': url,
        'type': FicType.manual,
        'lid': lid,
        'ficStatus': Status.complete,
        'wordCount': -1,
        'description': 'FILL IN MY DESCRIPTION',
        'title': 'FILL IN MY TITLE',
        'published': 'FILL IN MY PUBLISHED DATE',
        'updated': 'FILL IN MY UPDATED DATE',
        'added': int(time.time()),
        'fetched': int(time.time())
    }

    template = inflateObject(Fic.new().__dict__, renames, defaults)

    template['chapters'] = {
        cid: {
            'lastModified': int(time.time()),
            'status': Status.ongoing,
            'fetched': int(time.time()),
            'url': chapterUrl,
        }
        for cid, chapterUrl in enumerate(chapterUrls, 1)
    }
    template['chapterCount'] = len(chapterUrls)
    return template
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Resolve ``url`` to a ``FicId`` for this adapter's site.

    Urls outside ``self.baseUrl`` are rejected with ``None``. Known chapter
    or fic urls are answered from stored rows. Anything else must look like
    ``authors/<author>/<story>.html`` relative to the base url, otherwise
    ``None`` is returned.

    Raises:
        NotImplementedError: for a well-formed but unknown story url; the
            derived local id is computed but not yet mapped to anything.
    """
    base = self.baseUrl
    if not url.startswith(base):
        return None

    # by default, we simply try to look up the url in existing chapters or fics
    chapterHits = FicChapter.select({'url': url})
    if len(chapterHits) == 1:
        chapter = chapterHits[0]
        owner = Fic.get((chapter.ficId, ))
        if owner is not None:
            return FicId(
                FicType(owner.sourceId), owner.localId, chapter.chapterId, False
            )

    ficHits = Fic.select({'url': url})
    if len(ficHits) == 1:
        hit = ficHits[0]
        return FicId(FicType(hit.sourceId), hit.localId)

    remainder = url[len(base):]
    if not remainder.endswith('.html'):
        return None

    segments = remainder.split('/')
    if not (len(segments) == 3 and segments[0] == 'authors'):
        return None

    _, author, storyId = segments
    for suffix in ('01a.html', '.html'):
        if storyId.endswith(suffix):
            storyId = storyId[:-len(suffix)]

    # note: seems to be safe to lowercase these
    lid = '/'.join((author, storyId)).lower()
    # make lid author/story ?
    # TODO: we need some sort of local lid mapping...
    raise NotImplementedError()
def dumpDB():
    """Export the database as a nested dict and write chapter content to disk.

    The returned dict has the keys ``fandoms``, ``characters``, ``genres``,
    ``tags`` and ``fics``. ``fics`` maps ``'<type>/<localId>'`` to the
    deflated fic, including its chapters keyed by chapter id.

    As a side effect, every chapter whose ``raw`` is not ``None`` has its
    content written to
    ``./content/<type>/<localId>/<chapterId>/content.html``.
    """
    data = {}

    fandomMap = {f.id: f for f in Fandom.select()}
    characterMap = {c.id: c for c in Character.select()}
    genreMap = {g.id: g for g in Genre.select()}
    tagMap = {t.id: t for t in Tag.select()}

    data['fandoms'] = [fandom.name for fandom in fandomMap.values()]
    data['characters'] = [
        {
            'name': character.name,
            'fandom': fandomMap[character.fandom_id].name
        } for character in characterMap.values()
    ]
    data['genres'] = [genre.name for genre in genreMap.values()]
    data['tags'] = [tag.name for tag in tagMap.values()]

    data['fics'] = {}

    frename = {'id': None, 'chapters': 'chapterCount'}
    crename = {
        'id': None,
        'ficId': None,
        'cid': None,
        'raw': None,
        'fic': None,
        'lastLine': None
    }
    # NOTE: 'url' and 'lastModified' are overwritten for every chapter below,
    # so this dict is shared, mutable state across the whole export.
    cdefaults = {
        'line': 0,
        'subLine': 0,
        'notes': None,
        'status': Status.ongoing,
        'fetched': None,
        'url': None
    }

    for fic in Fic.select():
        k = '{}/{}'.format(fic.type, fic.localId)
        o = deflateObject(fic.__dict__.copy(), frename)
        o['fandoms'] = [f.name for f in fic.fandoms()]
        o['characters'] = [
            {
                'name': c.name,
                'fandom': fandomMap[c.fandom_id].name
            } for c in fic.characters()
        ]
        o['tags'] = [t.name for t in fic.tags()]
        o['genres'] = [g.name for g in fic.genres()]

        co = {}
        for chapter in FicChapter.select({'ficId': fic.id}):
            here = chapter.__dict__.copy()
            ffNetUrl = 'https://www.fanfiction.net/s/{}/{}/{}'.format(
                fic.localId, chapter.chapterId, util.urlTitle(fic.title)
            )
            cdefaults['url'] = ffNetUrl
            cdefaults['lastModified'] = here['fetched']
            here = deflateObject(here, crename, cdefaults)

            co[chapter.chapterId] = here

            if chapter.raw is None:
                continue

            contentPath = './content/{}/{}/{}/'.format(
                fic.type, fic.localId, chapter.chapterId
            )
            # exist_ok avoids the check-then-create race of isdir + makedirs
            os.makedirs(contentPath, exist_ok=True)
            # write html as utf-8 explicitly rather than relying on the
            # platform default encoding
            with open(contentPath + 'content.html', 'w', encoding='utf-8') as out:
                out.write(chapter.content())

        o['chapters'] = co
        data['fics'][k] = o

    return data
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate ``fic`` from the html of a story page and persist it.

    The ``profile_top`` element supplies title, description, counts, dates,
    status and author; ``pre_story_links`` supplies fandoms; ``chap_select``
    supplies chapter titles. The fic is upserted and re-selected, its
    chapters are upserted, and finally the ``xgray`` meta span is parsed on a
    best-effort basis (failures there are logged, not raised).

    When ``profile_top`` is absent and the page shows one of the known
    "story not found" warnings, the fic is marked abandoned (unless it was
    already complete), upserted and returned immediately.

    Returns:
        The fic as re-selected after the upsert, or the passed-in fic on the
        deleted-story path.

    Raises:
        Exception: when the profile block, title, description or author
            cannot be found, the status or a fandom category is unknown, or
            the fic cannot be re-selected after the upsert.
    """
    from bs4 import BeautifulSoup  # type: ignore

    deletedFicTexts = [
        # probably deleted by user
        'Story Not FoundUnable to locate story. Code 1.',
        # probably deleted by admin
        'Story Not FoundUnable to locate story. Code 2.',
        # unknown
        'Story Not FoundStory is unavailable for reading. (A)',
    ]

    soup = BeautifulSoup(wwwHtml, 'html5lib')
    profile_top = soup.find(id='profile_top')

    # story might've been deleted
    if profile_top is None:
        gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
        for gui_warning in gui_warnings:
            if gui_warning.get_text() in deletedFicTexts:
                if fic.ficStatus != FicStatus.complete:
                    fic.ficStatus = FicStatus.abandoned
                fic.upsert()
                return fic
        # fail with a clear message instead of an AttributeError on None
        raise Exception(
            'error: unable to find profile_top:\n{}\n'.format(fic.localId)
        )

    text = profile_top.get_text()
    pt_str = str(profile_top)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    for b in profile_top.find_all('b'):
        # Tag.get returns None when the attribute is absent
        b_class = b.get('class') or []
        if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
            fic.title = b.get_text()
            break
    else:
        raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

    fic.url = self.constructUrl(fic.localId, 1, fic.title)

    descriptionFound = False
    for div in profile_top.find_all('div'):
        div_class = div.get('class') or []
        if (
            div.get('style') == 'margin-top:2px' and len(div_class) == 1
            and div_class[0] == 'xcontrast_txt'
        ):
            fic.description = div.get_text()
            descriptionFound = True
            break
    if not descriptionFound:
        raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    # TODO we should match this only on the section following the description
    matcher = RegexMatcher(
        text, {
            'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
            'chapterCount?': (r'Chapters:\s+(\d+)', int),
            'wordCount': (r'Words:\s+(\S+)', int),
            'reviewCount?': (r'Reviews:\s+(\S+)', int),
            'favoriteCount?': (r'Favs:\s+(\S+)', int),
            'followCount?': (r'Follows:\s+(\S+)', int),
            'updated?': (r'Rated:.*Updated:\s+(\S+)', str),
            'published': (r'Published:\s+([^-]+)', str),
        }
    )
    matcher.matchAll(fic)

    if fic.published is not None:
        publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
        fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    else:
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    if fic.chapterCount is None:
        fic.chapterCount = 1

    match = re.search(
        r'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
    )
    if match is None:
        fic.ficStatus = FicStatus.ongoing
    else:
        status = match.group(2)
        if status == 'Complete':
            fic.ficStatus = FicStatus.complete
        else:
            raise Exception('unknown status: {}: {}'.format(fic.url, status))

    for a in profile_top.find_all('a'):
        a_href = a.get('href')
        if a_href is not None and a_href.startswith('/u/'):
            author = a.get_text()
            authorUrl = self.baseUrl + a_href
            authorId = a_href.split('/')[2]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
    else:
        raise Exception('unable to find author:\n{}'.format(text))

    preStoryLinks = soup.find(id='pre_story_links')
    preStoryLinksLinks = []
    if preStoryLinks is not None:
        preStoryLinksLinks = preStoryLinks.find_all('a')
    pendingFandoms: List[Fandom] = []
    for a in preStoryLinksLinks:
        href = a.get('href')
        if href is None:
            # an anchor without a target cannot name a category or fandom
            continue
        hrefParts = href.split('/')

        # if it's a top level category
        if (
            len(hrefParts) == 3 and len(hrefParts[0]) == 0
            and len(hrefParts[2]) == 0
        ):
            cat = hrefParts[1]
            if cat in ffNetFandomCategories:
                continue  # skip categories
            raise Exception('unknown category: {}'.format(cat))

        # if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
        if (
            len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
            and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
        ):
            fIds = [int(hrefParts[2]), int(hrefParts[3])]
            pendingFandoms += self.handleCrossoverFandom(
                fic, hrefParts[1], fIds, href
            )
            continue

        # if it's a regular fandom in some category
        if (
            len(hrefParts) == 4 and len(hrefParts[0]) == 0
            and len(hrefParts[3]) == 0
        ):
            # ensure category is in our map
            if hrefParts[1] not in ffNetFandomCategories:
                raise Exception('unknown category: {}'.format(hrefParts[1]))

            pendingFandoms += self.handleFandom(fic, hrefParts[2])
            continue

        util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

    fic.upsert()
    # re-select after the upsert and continue with the stored row
    poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
    if len(poss) != 1:
        raise Exception('unable to upsert fic?')
    fic = poss[0]
    for pfandom in pendingFandoms:
        fic.add(pfandom)

    if fic.chapterCount is None:
        return fic

    chapterTitles = []
    if fic.chapterCount > 1:
        chapterSelect = soup.find(id='chap_select')
        chapterOptions = []
        if chapterSelect is not None:
            chapterOptions = chapterSelect.find_all('option')
        chapterTitles = [co.get_text().strip() for co in chapterOptions]

    for cid in range(1, fic.chapterCount + 1):
        ch = fic.chapter(cid)
        ch.localChapterId = str(cid)
        ch.url = self.constructUrl(fic.localId, cid)
        # titles are 0-indexed while chapter ids are 1-indexed, so index
        # cid - 1 is valid whenever there are at least cid titles
        if len(chapterTitles) >= cid:
            ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
        elif fic.chapterCount == 1 and cid == 1:
            ch.title = fic.title
        ch.upsert()

    metaSpan = profile_top.find('span', {'class': 'xgray'})
    if metaSpan is not None:
        try:
            res = self.parseFicMetaSpan(metaSpan.decode_contents())
            #fic.language = res["language"]

            # reconstruct
            fields = [
                ('rated', 'Rated: Fiction ZZZ'),
                ('language', 'Language: ZZZ'),
                ('genres', 'Genre: ZZZ'),
                ('characters', 'Characters: ZZZ'),
                ('reviews', 'Reviews: ZZZ'),
                ('favorites', 'Favs: ZZZ'),
                ('follows', 'Follows: ZZZ'),
            ]
            rmeta = ' - '.join(
                [f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]
            )

            fic.extraMeta = rmeta
            publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
            fic.published = OilTimestamp(publishedUts)
            fic.updated = fic.published
            if 'updated' in res:
                updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
                fic.updated = OilTimestamp(updatedUts)
            fic.upsert()

        except Exception as e:
            # best-effort: the fic was already stored above, so a meta span
            # that fails to parse is logged rather than raised
            util.logMessage(
                f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
            )
            util.logMessage(
                f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
            )

    return fic
def __tryParse(ident: str) -> Optional['FicId']:
    """Try to turn a free-form identifier into a ``FicId``.

    The guesses are attempted in this order:

    1. blank input gives ``None``;
    2. anything starting with ``http`` (after dropping a ``view-source:``
       prefix) is parsed as a url;
    3. ``link{suffix}(...)`` forms, matched per adapter ``botLinkSuffix``;
    4. a stored fic id, optionally followed by ``/<chapter>``;
    5. a stored url id, optionally followed by ``/<chapter>``;
    6. a bare number or ``<id>/<chapter>``, assumed to be ``FicType.ff_net``;
    7. the input with ``https://`` prepended, parsed as a url;
    8. ``FicId.tryParseFallback``.
    """
    if len(ident.strip()) < 1:
        return None

    # strip view-source from potential urls
    if ident.startswith('view-source:http'):
        ident = ident[len('view-source:'):]

    # guess url next
    if ident.startswith('http'):
        return FicId.tryParseUrl(ident)

    # check for link{site}(id) style idents
    for ftype in adapters:
        a = adapters[ftype]
        if a is None:
            continue
        if a.botLinkSuffix is None:
            continue
        prefix = 'link{}('.format(a.botLinkSuffix)
        if ident.startswith(prefix) and ident.endswith(')'):
            mid = ident[len(prefix):-1]
            if not mid.isnumeric():
                # Re-parse only the text inside the parentheses; handing the
                # unchanged ident back to the parser would match this same
                # branch again (presumably tryParse routes back here).
                return FicId.tryParse(mid)
            return FicId(ftype, mid, ambiguous=False)

    # maybe it's an actual story id
    from store import Fic
    parts = ident.split('/')
    if parts[0].isnumeric():
        fic = Fic.get((int(parts[0]), ))
        if fic is not None:
            fid = fic.fid()
            if len(parts) == 2 and parts[1].isnumeric():
                fid.chapterId = int(parts[1])
                fid.ambiguous = False
            return fid

    # or maybe it's a url id...
    potential = Fic.select({'urlId': parts[0]})
    if len(potential) == 1:
        fid = potential[0].fid()
        if len(parts) == 2 and parts[1].isnumeric():
            fid.chapterId = int(parts[1])
            fid.ambiguous = False
        return fid

    # assume numeric is ffnet
    if ident.isnumeric():
        return FicId(FicType.ff_net, ident)

    # guess story/chapter on ffnet
    if len(parts) == 2 and parts[1].isnumeric():
        cid = int(parts[1])
        return FicId(FicType.ff_net, parts[0], cid, ambiguous=False)

    # try prepending https protocol
    if ident.find('://') < 0:
        ident = 'https://' + ident
        return FicId.tryParseUrl(ident)

    # just guess manual adapter
    return FicId.tryParseFallback(ident)
def v0_fic(urlId: str) -> Any:
    """Return the fic identified by ``urlId`` wrapped in ``Err.ok``.

    When the lookup does not yield exactly one fic, the
    ``Err.urlId_not_found`` response is returned instead.
    """
    matches = Fic.select({'urlId': urlId})
    if len(matches) == 1:
        return Err.ok(matches[0].toJSONable())
    return Err.urlId_not_found.get()