def createFromZList(self, fic: Fic, ts: int, data: str) -> Fic:
    fic.url = self.constructUrl(fic.localId, 1)
    fic = self.parseZListInfoInto(fic, ts, data)
    fic.upsert()
    return Fic.lookup((fic.id, ))

def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId)
    data = scrape.softScrape(fic.url)
    if data is None:
        raise Exception('unable to scrape? FIXME')
    fic = self.parseInfoInto(fic, data)
    fic.upsert()
    return Fic.lookup((fic.id, ))

def get(self, localId: str) -> Fic:
    existing = Fic.select({'sourceId': self.ftype, 'localId': localId})
    if len(existing) == 1:
        return existing[0]
    fic = Fic.new()
    fic.sourceId = self.ftype
    fic.localId = localId
    fic.created = OilTimestamp.now()
    return self.create(fic)

def getCurrentInfo(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId)
    url = self.tocUrl
    data = scrape.scrape(url)
    edumpContent('<!-- {} -->\n{}'.format(url, data['raw']), 'wavesarisen_ec')
    fic = self.parseInfoInto(fic, data['raw'])
    fic.upsert()
    return Fic.lookup((fic.id, ))

def getFromZList(self, localId: int, ts: int, html: str) -> Fic:
    existing = Fic.select({'sourceId': self.ftype, 'localId': str(localId)})
    if len(existing) != 1:
        fic = Fic.new()
        fic.sourceId = self.ftype
        fic.localId = str(localId)
        fic.created = OilTimestamp.now()
    else:
        fic = existing[0]
    return self.createFromZList(fic, ts, html)

def get(self, localId: str) -> Fic:
    existing = Fic.select({'sourceId': self.ftype, 'localId': localId})
    if len(existing) == 1:
        return existing[0]
    if not self.cacheable:
        raise Exception('cannot cache {}/{}'.format(localId, self.ftype))
    fic = Fic.new()
    fic.sourceId = self.ftype
    fic.localId = localId
    fic.created = OilTimestamp.now()
    return self.create(fic)

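# Hedged usage sketch for the get-or-create pattern above; SomeAdapter and the
# localId value are hypothetical stand-ins for a concrete adapter. get() is
# idempotent: a second call returns the stored row instead of re-scraping.
#
#     adapter = SomeAdapter()
#     first = adapter.get('12345')   # scrapes and inserts on first sight
#     again = adapter.get('12345')   # served straight from the Fic table
#     assert first.id == again.id
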
def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId)
    # scrape fresh info
    data = scrape.scrape(fic.url)
    edumpContent(data['raw'], 'sugarquill')
    fic = self.parseInfoInto(fic, data['raw'])
    fic.upsert()
    return Fic.lookup((fic.id, ))

def __init__(self, parent: Optional['Hermes'], target: Optional[Fic] = None) -> None:
    self.parent = parent
    self.fics = Fic.list() if target is None else Fic.list({'id': target.id})
    self.list = self.fics
    self.idx = 0
    self.filter = ''
    self.width, self.height = 80, 24
    self.msg: Optional[Tuple[int, str]] = None
    self.__refilter(target)
    self._userFicCache: Dict[int, UserFic] = {}
    self._rebuildUserFicCache()

def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId)
    # scrape fresh info
    data = scrape.scrape(fic.url)
    time.sleep(self.baseDelay)
    edumpContent(data['raw'], 'hpffa')
    fic = self.parseInfoInto(fic, data['raw'])
    fic.upsert()
    return Fic.lookup((fic.id, ))

def create(self, fic: Fic) -> Fic:
    # TODO: should we try to get the actual url here, including the url-safe
    # version of the title before the lid? This needs to be done elsewhere in
    # this adapter as well.
    fic.url = self.baseUrl + 'threads/' + str(fic.localId)
    # scrape fresh info
    data = self.scrapeLike(fic.url)
    fic = self.parseInfoInto(fic, data)
    fic.upsert()
    return Fic.lookup((fic.id, ))

def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId, 1)
    # scrape fresh info
    data = scrape.scrape(fic.url)
    fic = self.parseInfoInto(fic, data['raw'])
    fic.insert()
    chapter = fic.chapter(1)
    chapter.setHtml(data['raw'])
    chapter.upsert()
    return Fic.lookup((fic.id, ))

def tryParseUrl(self, url: str) -> Optional[FicId]:
    # by default, we simply try to look up the url in existing chapters or fics
    chaps = FicChapter.select({'url': url})
    if len(chaps) == 1:
        fic = Fic.get((chaps[0].ficId, ))
        if fic is not None:
            return FicId(
                FicType(fic.sourceId), fic.localId, chaps[0].chapterId, False
            )
    fics = Fic.select({'url': url})
    if len(fics) == 1:
        return FicId(FicType(fics[0].sourceId), fics[0].localId)
    raise NotImplementedError()

def getCurrentInfo(self, fic: Fic) -> Fic:
    fic.url = self.baseUrl + str(fic.localId)
    url = fic.url.split('?')[0] + '?view_adult=true'
    # scrape fresh info
    data = scrape.scrape(url)
    return self.parseInfoInto(fic, data['raw'])

def v0_fic_all(urlId: str) -> Any:
    fics = Fic.select({'urlId': urlId})
    if len(fics) != 1:
        return Err.urlId_not_found.get()
    fic = fics[0]
    if fic.chapterCount is None:
        print(f'err: fic has no chapter count: {fic.id}')
        return Err.urlId_not_found.get()
    ficChapters = {
        fc.chapterId: fc
        for fc in FicChapter.select({'ficId': fic.id})
    }
    chapters = {}
    for cid in range(1, fic.chapterCount + 1):
        if cid not in ficChapters:
            return Err.cid_not_found.get({'arg': f'{fic.id}/{cid}'})
        chapter = ficChapters[cid]
        cres = chapter.toJSONable()
        try:
            content = cres['content']
            if content is not None:
                content = util.decompress(content)
                content = scrape.decodeRequest(content, f'{fic.id}/{cid}')
                content = cleanHtml(content)
                # warn if cleanHtml is not idempotent on this chapter
                if content != cleanHtml(content):
                    print(
                        f'v0_fic_all: {fic.id}/{cid} did not round-trip through cleanHtml'
                    )
            cres['content'] = content
            chapters[cid] = cres
        except:
            # on any decode failure, omit this chapter rather than fail the request
            pass
    res = fic.toJSONable()
    return Err.ok({'info': res, 'chapters': chapters})

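# Minimal self-contained sketch of the round-trip check above: cleaning
# already-cleaned content should be a no-op, otherwise the cleaner is unstable
# on that input. This _sketch_cleanHtml is a hypothetical stand-in for the
# real cleanHtml.
def _sketch_cleanHtml(s: str) -> str:
    return ' '.join(s.split())  # toy whitespace normalizer, not the real cleaner

_once = _sketch_cleanHtml('a  b\n c')
assert _once == _sketch_cleanHtml(_once)  # stable: it round-trips
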
def setAuthor(
    self, fic: Fic, author: str, authorUrl: str, authorLocalId: str
) -> None:
    fic.authorId = Author.getId(author, self.ftype)
    AuthorSource.getId(
        fic.authorId, self.ftype, author, authorUrl, authorLocalId
    )

def populateManualTemplate(url, chapterUrls, author):
    existingManual = Fic.select({'type': FicType.manual})
    lid = len(existingManual) + 1
    manRename = {'id': None}
    manDefaults = {
        'fandoms': [],
        'characters': [],
        'tags': [],
        'genres': [],
        'authorUrl': url,
        'author': author,
        'authorId': author,
        'ageRating': 'M',
        'language': 'English',
        'favorites': 0,
        'follows': 0,
        'reviews': 0,
        'url': url,
        'lastUrl': url,
        'type': FicType.manual,
        'lid': lid,
        'ficStatus': Status.complete,
        'wordCount': -1,
        'description': 'FILL IN MY DESCRIPTION',
        'title': 'FILL IN MY TITLE',
        'published': 'FILL IN MY PUBLISHED DATE',
        'updated': 'FILL IN MY UPDATED DATE',
        'added': int(time.time()),
        'fetched': int(time.time()),
    }
    fic = Fic.new().__dict__
    fic = inflateObject(fic, manRename, manDefaults)
    fic['chapters'] = {}
    fic['chapterCount'] = len(chapterUrls)
    for cid in range(1, len(chapterUrls) + 1):
        fic['chapters'][cid] = {
            'lastModified': int(time.time()),
            'status': Status.ongoing,
            'fetched': int(time.time()),
            'url': chapterUrls[cid - 1],
        }
    return fic

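# Hedged usage sketch for populateManualTemplate; the URLs and author are
# placeholders. The returned dict still carries the FILL IN MY ... sentinels,
# which are meant to be edited by hand before the template is imported
# (see importFic below).
#
#     fic = populateManualTemplate(
#         'https://example.com/story',
#         ['https://example.com/story/1', 'https://example.com/story/2'],
#         'SomeAuthor',
#     )
#     fic['title'] = 'An Actual Title'
#     fic['description'] = 'An actual description.'
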
def refresh(self) -> None:
    self.fics = Fic.list()
    self._rebuildUserFicCache()
    target = None
    if self.idx < len(self.list):
        target = self.list[self.idx]
    self.__refilter(target)
    self.pushMessage('refreshed fic list')

def buildUrl(self, chapter: 'FicChapter') -> str:
    # TODO: do we need these 2 lines or will they always be done by whatever
    # creates the FicChapter?
    if chapter.fic is None:
        chapter.fic = Fic.lookup((chapter.ficId, ))
    return self.constructUrl(
        chapter.fic.localId, chapter.chapterId, chapter.fic.title
    )

def create(self, fic: Fic) -> Fic:
    fic.url = self.baseUrl + str(fic.localId)
    # scrape fresh info
    url = fic.url.split('?')[0] + '?view_adult=true'
    data = scrape.scrape(url)
    edumpContent(data['raw'], 'ao3')
    fic = self.parseInfoInto(fic, data['raw'])
    fic.upsert()
    chapter = fic.chapter(1)
    chapter.setHtml(data['raw'])
    chapter.upsert()
    return Fic.lookup((fic.id, ))

def populateFATemplate(author, storyAbbreviation, chapterCount):
    url = 'http://www.fictionalley.org/authors/{}/{}.html'.format(
        author, storyAbbreviation
    )
    lastUrl = url[:-5] + '01.html'
    if chapterCount == 1:
        lastUrl = url[:-5] + '01a.html'
    lid = 1
    faRename = {'id': None}
    faDefaults = {
        'fandoms': ['Harry Potter'],
        'characters': [],
        'tags': [],
        'genres': [],
        'authorUrl': 'http://www.fictionalley.org/authors/{}'.format(author),
        'author': author,
        'authorId': author,
        'ageRating': 'PG',
        'language': 'English',
        'favorites': 0,
        'follows': 0,
        'reviews': 0,
        'url': url,
        'lastUrl': lastUrl,
        'type': FicType.fictionalley,
        'lid': lid,
        'ficStatus': Status.complete,
        'wordCount': -1,
        'description': 'FILL IN MY DESCRIPTION',
        'title': 'FILL IN MY TITLE',
        'published': 'FILL IN MY PUBLISHED DATE',
        'updated': 'FILL IN MY UPDATED DATE',
        'added': int(time.time()),
        'fetched': int(time.time()),
    }
    fic = Fic.new().__dict__
    fic = inflateObject(fic, faRename, faDefaults)
    fic['chapters'] = {}
    fic['chapterCount'] = chapterCount
    for cid in range(1, chapterCount + 1):
        chapterUrl = url[:-5] + '{:02}.html'.format(cid)
        if chapterCount == 1:
            chapterUrl = url[:-5] + '01a.html'
        fic['chapters'][cid] = {
            'lastModified': int(time.time()),
            'status': Status.ongoing,
            'fetched': int(time.time()),
            'url': chapterUrl,
        }
        contentDir = './content/{}/{}/{}'.format(FicType.fictionalley, lid, cid)
        if not os.path.isdir(contentDir):
            os.makedirs(contentDir)
    return fic

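# Self-contained sketch of the FictionAlley chapter url naming used above:
# url[:-5] strips the trailing '.html', multi-chapter stories append a
# zero-padded NN.html suffix, and single-chapter stories use the special
# 01a.html form. The author and story names here are made up.
url = 'http://www.fictionalley.org/authors/someauthor/somestory1.html'
print(url[:-5] + '{:02}.html'.format(3))  # .../someauthor/somestory103.html
print(url[:-5] + '01a.html')              # .../someauthor/somestory101a.html
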
def importFic(fdata):
    global ficImportRename
    ofic = inflateObject(fdata.copy(), ficImportRename)
    fic = Fic.new()
    for field in ofic:
        print('setting "{}" to "{}"'.format(field, ofic[field]))
        fic.__dict__[field] = ofic[field]
    fic.published = util.parseDateAsUnix(fic.published, int(time.time()))
    fic.updated = util.parseDateAsUnix(fic.updated, int(time.time()))
    print('setting "{}" to "{}"'.format('published', fic.published))
    print('setting "{}" to "{}"'.format('updated', fic.updated))
    print('adding "{}" ({}/{})'.format(fic.title, fic.type, fic.localId))
    fic.insert()
    for fandom in fdata['fandoms']:
        print('  adding fandom "{}"'.format(fandom))
        fic.add(Fandom.define(fandom))
    for character in fdata['characters']:
        print(
            '  adding character "{}" from fandom "{}"'.format(
                character['name'], character['fandom']
            )
        )
        fic.add(
            Character.define(Fandom.define(character['fandom']), character['name'])
        )
    for genre in fdata['genres']:
        print('  adding genre "{}"'.format(genre))
        fic.add(Genre.define(genre))
    for tag in fdata['tags']:
        print('  adding tag "{}"'.format(tag))
        fic.add(Tag.define(tag))
    cids = [int(cid) for cid in fdata['chapters']]
    cids.sort()
    for cid in cids:
        print('  adding chapter {}'.format(cid))
        ochap = fdata['chapters'][str(cid)]
        chapter = FicChapter.new()
        chapter.fic = fic
        chapter.ficId = fic.id
        chapter.chapterId = cid
        for field in ochap:
            chapter.__dict__[field] = ochap[field]
        contentPath = './content/{}/{}/{}/content.html'.format(
            fic.type, fic.localId, cid
        )
        if os.path.isfile(contentPath):
            html = None
            with open(contentPath, 'r') as f:
                html = f.read()
            print('    has content: {}'.format(len(html)))
            chapter.setHtml(html)
        chapter.insert()

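# Hedged end-to-end sketch tying the two helpers together: build a manual
# template, fill in the sentinels, then import it. All values are placeholders
# and the flow assumes importFic accepts exactly what the template produces.
#
#     fic = populateManualTemplate(url, chapterUrls, author)
#     fic['title'] = '...'        # replace each FILL IN MY ... sentinel
#     fic['description'] = '...'
#     fic['published'] = '2020-01-01'
#     fic['updated'] = '2020-01-02'
#     importFic(fic)
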
def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId, 1)
    # scrape fresh info
    data = scrape.softScrape(fic.url)
    if data is None:
        raise Exception('unable to scrape? FIXME')
    fic = self.parseInfoInto(fic, data)
    fic.upsert()
    chapter = fic.chapter(1)
    chapter.setHtml(data)
    chapter.localChapterId = str(1)
    chapter.url = self.constructUrl(fic.localId, 1)
    chapter.upsert()
    return Fic.lookup((fic.id, ))

def buildUrl(self, chapter: 'FicChapter') -> str:
    # TODO: do we need these 2 lines or will they always be done by whatever
    # creates the FicChapter?
    if chapter.fic is None:
        chapter.fic = Fic.lookup((chapter.ficId, ))
    if chapter.localChapterId is None:
        raise Exception('chapter missing localChapterId? FIXME')
    return self.constructUrl(chapter.fic.localId, int(chapter.localChapterId))

def tryParseUrl(self, url: str) -> Optional[FicId]:
    if not url.startswith(self.baseUrl):
        return None
    # by default, we simply try to look up the url in existing chapters or fics
    chaps = FicChapter.select({'url': url})
    if len(chaps) == 1:
        fic = Fic.get((chaps[0].ficId, ))
        if fic is not None:
            ftype = FicType(fic.sourceId)
            return FicId(ftype, fic.localId, chaps[0].chapterId, False)
    fics = Fic.select({'url': url})
    if len(fics) == 1:
        ftype = FicType(fics[0].sourceId)
        return FicId(ftype, fics[0].localId)
    leftover = url[len(self.baseUrl):]
    if not leftover.endswith('.html'):
        return None
    ps = leftover.split('/')
    if len(ps) != 3 or ps[0] != 'authors':
        return None
    author = ps[1]
    storyId = ps[2]
    suffixes = ['01a.html', '.html']
    for suffix in suffixes:
        if storyId.endswith(suffix):
            storyId = storyId[:-len(suffix)]
    # note: seems to be safe to lowercase these
    lid = (author + '/' + storyId).lower()
    # print(lid)
    # make lid author/story ?
    # TODO: we need some sort of local lid mapping...
    raise NotImplementedError()

def getCurrentInfo(self, fic: Fic) -> Fic:
    # grab the content from disk
    info = self.getArchiveStoryInfo(int(fic.localId))
    spath = '{}/archive/{}/{}/summary.html.gz'.format(
        self.archivePath, info[1], info[2]
    )
    data = self.slurp(spath)
    fic = self.parseInfoInto(fic, data)
    fic.upsert()
    chapterCount = fic.chapterCount or 1
    # chapter filenames are zero-padded to the decimal width of the chapter count
    dCount = int(math.floor(math.log(chapterCount, 10) + 1))
    localChapterIdMap = self.getChapterIds(int(fic.localId))
    for cid in range(1, chapterCount + 1):
        pcid = str(cid).zfill(dCount)
        fpath = '{}/archive/{}/{}/chapters/chapter_{}.html.gz'.format(
            self.archivePath, info[1], info[2], pcid
        )
        data = self.slurp(fpath)
        chapter = fic.chapter(cid)
        chapter.localChapterId = localChapterIdMap[cid]
        chapter.setHtml(data)
        chapter.upsert()
    return Fic.lookup((fic.id, ))

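# Self-contained sketch of the zero-padding arithmetic above: dCount is the
# decimal width of chapterCount, and each cid is zfill'ed to that width,
# matching archive names like chapter_01.html.gz.
import math

def paddedChapterIds(chapterCount: int) -> list:
    dCount = int(math.floor(math.log(chapterCount, 10) + 1))
    return [str(cid).zfill(dCount) for cid in range(1, chapterCount + 1)]

assert paddedChapterIds(3) == ['1', '2', '3']
assert paddedChapterIds(12)[0] == '01'
assert paddedChapterIds(120)[0] == '001'
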
def getDeepAuthorPosts(self, fic: Fic) -> Dict[str, Any]:
    from bs4 import BeautifulSoup
    urls = self.getDeepPageUrls(fic)
    soups: Dict[str, Any] = {}
    for url in urls:
        pageContent = self.scrapeLike(url)
        pageSoup = BeautifulSoup(pageContent, 'html5lib')
        posts = pageSoup.find_all(
            self.postContainer, {
                'class': 'message',
                'data-author': fic.getAuthorName()
            }
        )
        for post in posts:
            soups[post.get('id')] = post
    return soups

def getCurrentInfo(self, fic: Fic) -> Fic:
    # FIXME: when fics are deleted they 404:
    #   https://www.royalroad.com/fiction/38947/
    #   404 Page Not Found
    #   The server has returned the following error:
    #     This fiction has been deleted
    fic.url = self.constructUrl(fic.localId)
    data = self.scrape(fic.url)
    if 'raw' not in data:
        raise Exception('unable to scrape? FIXME')
    raw = data['raw']
    return self.parseInfoInto(fic, raw)

def v0_cache(urlId: str) -> Any:
    fics = Fic.select({'urlId': urlId})
    if len(fics) != 1:
        return Err.urlId_not_found.get()
    fic = fics[0]
    if fic.chapterCount is None:
        print(f'err: fic has no chapter count: {fic.id}')
        return Err.urlId_not_found.get()
    for cid in range(1, fic.chapterCount + 1):
        try:
            chapter = fic.chapter(cid)
            chapter.cache()
        except Exception:
            return Err.failed_to_cache_cid.get({'arg': f'{fic.id}/{cid}'})
    return Err.ok(fic.toJSONable())

def getDeepAuthorPostUrls(self, fic: Fic) -> List[str]:
    urls = self.getDeepPageUrls(fic)
    util.logMessage(f'XenForo.getDeepAuthorPostUrls|deep page urls: {urls}')
    # TODO: this should probably be more comprehensive...
    author = fic.getAuthorName()
    # XenForo HTML-escapes apostrophes in attribute values (assumed entity form)
    altAuthor = author.replace("'", '&#039;')
    postUrls: List[str] = []
    seenIdStubs = set()
    for url in urls:
        pageContent = self.scrapeLike(url)
        # See getReaderPostUrls for a fully parsed version
        for b in pageContent.split('<'):
            e = b.find('>')
            if e == -1:
                continue
            s = b[:e]
            # TODO FIXME this is bad :(
            # looking for li or article (the post container)
            if not (b.startswith('li id=') or b.startswith('article class=')):
                continue
            # check for 'message' -- simulates checking for the message class
            if 'message' not in s:
                continue
            # to check the data-author we simply look for the author and hope
            # there aren't collisions
            if s.find(author) < 0 and s.find(altAuthor) < 0:
                continue
            # loop over spaced tokens looking for an unspaced id attribute
            for sb in s.split():
                if not sb.startswith('id="') or not sb.endswith('"'):
                    continue
                idStub = sb[len('id="'):-1]
                if idStub.startswith('js-'):
                    idStub = idStub[len('js-'):]
                postUrl = url + '#' + idStub
                if idStub not in seenIdStubs:
                    postUrls += [postUrl]
                seenIdStubs |= {idStub}
    util.logMessage(f'XenForo.getDeepAuthorPostUrls|postUrls: {postUrls}')
    return postUrls

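# Self-contained sketch of the idStub extraction above, run against a made-up
# XenForo-style post container. The optional 'js-' prefix is stripped so the
# url fragment matches the server-side anchor id.
tag = 'article class="message" id="js-post-123" data-author="Alice"'
for sb in tag.split():
    if sb.startswith('id="') and sb.endswith('"'):
        idStub = sb[len('id="'):-1]
        if idStub.startswith('js-'):
            idStub = idStub[len('js-'):]
        print('https://example.com/threads/1/page-2#' + idStub)
        # -> https://example.com/threads/1/page-2#post-123
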
def v0_lookup() -> Any:
    q = request.args.get('q', '').strip()
    if len(q) < 1:
        return Err.no_query.get({'arg': q})
    print(f'v0_lookup: query: {q}')
    ficId = FicId.tryParse(q)
    if ficId is None:
        return Err.bad_query.get({'arg': q})
    print(f'v0_lookup: ficId: {ficId.__dict__}')
    try:
        fic = Fic.load(ficId)
        return v0_fic(fic.urlId)
    except:
        print('v0_lookup: something went wrong in load:')
        traceback.print_exc()
    return Err.bad_ficId.get({'arg': ficId.__dict__})