def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Try to turn a story url into a FicId for this adapter.

    The url is split on '/' and is expected to look like
    ``http(s)://<author>.<host>/<story>[/Chapter_<n>[/...]]`` where the host
    part must end with ``self.urlFragments[0]``.

    Returns:
        None when the url has fewer than four '/'-separated parts, is not
        http/https, has a host that does not end with
        ``self.urlFragments[0]``, or has an empty story segment. Otherwise a
        FicId built from ``self.ftype`` and the local id
        ``'<author>/<story>'``. When a well-formed ``Chapter_<n>`` segment
        follows the story, ``chapterId`` is set to ``n`` and ``ambiguous``
        is cleared.
    """
    # parts: [scheme + ':', '', host, story, (chapter), ...]
    parts = url.split('/')
    if len(parts) < 4:
        return None
    if parts[0] not in ('http:', 'https:'):
        return None

    host = parts[2]
    if not host.endswith(self.urlFragments[0]):
        return None

    storyLid = parts[3]
    if not storyLid:
        # e.g. 'https://author.host/' names an author but no story.
        return None

    # The author's local id is the first label of the host name.
    authorLid = host.split('.')[0]
    ficId = FicId(self.ftype, '{}/{}'.format(authorLid, storyLid))

    chapterPrefix = 'Chapter_'
    if len(parts) > 4 and parts[4].startswith(chapterPrefix):
        try:
            cid = int(parts[4][len(chapterPrefix):])
        except ValueError:
            # Malformed chapter segment: still a valid story url, so fall
            # back to the fic-level id instead of raising from a "try" parser.
            pass
        else:
            ficId.chapterId = cid
            ficId.ambiguous = False
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Try to turn a work, collection-work or chapter url into a FicId.

    The url is normalized first: ``http://``, ``http://www.`` and
    ``https://www.`` prefixes become ``https://``, and any fragment or query
    string is dropped.

    A url containing ``/chapters/`` but no ``/works/`` is resolved by
    scraping the page and following its 'Entire Work' link.

    Returns:
        None when the normalized url does not start with ``self.baseUrl`` or
        the segment after it is not a non-empty numeric id. Otherwise a
        FicId for that work. If the fic is already loaded and the url names
        a chapter that matches exactly one stored FicChapter, ``chapterId``
        is set and ``ambiguous`` is cleared.

    Raises:
        Exception: when a chapter-only url cannot be scraped, or the scraped
            page has no usable 'Entire Work' link.
    """
    # Rewriting one prefix can expose another, so repeat until none match.
    mapPrefixes = ('http://www.', 'http://', 'https://www.')
    hasPrefix = True
    while hasPrefix:
        hasPrefix = False
        for pref in mapPrefixes:
            if url.startswith(pref):
                hasPrefix = True
                url = 'https://' + url[len(pref):]

    # Drop the fragment, then the query string. This also covers suffixes
    # such as '#main' or '?view_adult=true', so no explicit list is needed.
    url = url.partition('#')[0]
    url = url.partition('?')[0]

    # TODO: this should probably return a FicId pointing to this chapter and
    # not just this fic in general...
    if '/chapters/' in url and '/works/' not in url:
        meta = scrape.softScrapeWithMeta(url, delay=10)
        if meta is None or meta['raw'] is None or meta['status'] != 200:
            raise Exception('unable to lookup chapter: {}'.format(url))

        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(meta['raw'], 'html5lib')
        for a in soup.find_all('a'):
            if a.get_text() != 'Entire Work':
                continue
            href = a.get('href')
            if href is None:
                # An anchor without a target cannot be followed; keep looking
                # rather than failing with an unrelated TypeError.
                continue
            # Drops the first len('/works/') characters of href; presumably
            # href starts with '/works/' and baseUrl ends with it -- verify.
            return self.tryParseUrl(self.baseUrl + href[len('/works/'):])
        raise Exception(
            'unable to lookup chapters entire work: {}'.format(url)
        )

    # Rewrite a work url nested under a collection as a plain work url.
    if url.startswith(self.collectionUrl) and '/works/' in url:
        url = self.baseUrl + url[url.find('/works/') + len('/works/'):]

    if not url.startswith(self.baseUrl):
        return None

    # pieces: [workId, ('chapters', localChapterId, ...)]
    pieces = url[len(self.baseUrl):].split('/')
    lid = pieces[0]
    if not lid or not lid.isnumeric():
        return None

    ficId = FicId(FicType.ao3, lid)
    fic = Fic.tryLoad(ficId)
    if fic is None:
        # Fic not loaded, so a chapter segment cannot be looked up.
        return ficId

    if len(pieces) >= 3 and pieces[1] == 'chapters' and pieces[2].isnumeric():
        localChapterId = pieces[2]
        mchaps = FicChapter.select(
            {'ficId': fic.id, 'localChapterId': localChapterId}
        )
        # Only pin the chapter when the lookup is unambiguous.
        if len(mchaps) == 1:
            ficId.chapterId = mchaps[0].chapterId
            ficId.ambiguous = False
    return ficId