def scrapeLike(self, url: str, delay: Optional[int] = None) -> str:
    url = scrape.canonizeUrl(url)
    if delay is None:
        delay = self.defaultDelay
    prefix = self.baseUrl + 'threads/'
    if not url.startswith(prefix):
        data = scrape.softScrape(url, delay, mustyThreshold=self.mustyThreshold)
        if data is None:
            raise Exception('unable to soft scrape? FIXME')
        return data

    # for thread urls, strip the title slug down to the numeric id and build
    # both a canonical url and a SQL LIKE pattern that matches any slug
    ulike = url[len(prefix):]
    parts = ulike.split('/')
    parts[0] = parts[0].split('.')[-1]
    canon = prefix + '/'.join(parts)
    parts[0] = '%.' + parts[0]
    ulike = prefix + '/'.join(parts)

    # FIXME canon may find an older url than ulike :/
    canonRes = scrape.getMostRecentScrapeWithMeta(canon)
    if (canonRes is not None
            and int(time.time()) - self.mustyThreshold < canonRes['fetched']):
        return cast(str, canonRes['raw'])

    data = scrape.softScrape(url, delay, ulike, mustyThreshold=self.mustyThreshold)
    if data is None:
        raise Exception('unable to soft scrape? FIXME')
    return data
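# Illustrative sketch (a hypothetical helper, not part of the adapter above):
# how scrapeLike derives the canonical url and the LIKE pattern from a
# XenForo-style thread url whose first path part is '<title-slug>.<id>'.
from typing import Tuple

def _canonAndUlike(url: str, prefix: str) -> Tuple[str, str]:
    parts = url[len(prefix):].split('/')
    parts[0] = parts[0].split('.')[-1]  # keep only the trailing thread id
    canon = prefix + '/'.join(parts)
    parts[0] = '%.' + parts[0]  # '%' is the SQL LIKE wildcard for any slug
    return canon, prefix + '/'.join(parts)

# _canonAndUlike('https://example.com/threads/some-story.123/',
#                'https://example.com/threads/')
# -> ('https://example.com/threads/123/',
#     'https://example.com/threads/%.123/')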
def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId)
    data = scrape.softScrape(fic.url)
    if data is None:
        raise Exception('unable to scrape? FIXME')

    fic = self.parseInfoInto(fic, data)
    fic.upsert()
    return Fic.lookup((fic.id, ))
def getChapterPublishDate(self, url: str) -> OilTimestamp:
    from bs4 import BeautifulSoup
    import dateutil.parser
    url = self.canonizeUrl(url)
    data = scrape.softScrape(url)
    assert data is not None
    soup = BeautifulSoup(data, 'html5lib')
    publishTimes = soup.findAll('time', {'class': ['entry-date', 'published']})
    if len(publishTimes) != 1:
        raise Exception('cannot find publish time for {}'.format(url))
    uts = util.dtToUnix(dateutil.parser.parse(publishTimes[0].get('datetime')))
    return OilTimestamp(uts)
def getChapterUrls(self, data: Optional[str] = None) -> List[str]:
    from bs4 import BeautifulSoup  # type: ignore
    if data is None:
        data = scrape.softScrape(self.tocUrl)
        assert data is not None
    soup = BeautifulSoup(data, 'html5lib')
    entryContents = soup.findAll('div', {'class': 'entry-content'})
    chapterUrls: List[str] = []
    for entryContent in entryContents:
        aTags = entryContent.findAll('a')
        for aTag in aTags:
            if aTag.get('href') is None:
                continue
            href = self.canonizeUrl(aTag.get('href'))
            if href in chapterUrls:
                continue
            chapterUrls += [href]
    return chapterUrls
def create(self, fic: Fic) -> Fic:
    fic.url = self.constructUrl(fic.localId, 1)

    # scrape fresh info
    data = scrape.softScrape(fic.url)
    if data is None:
        raise Exception('unable to scrape? FIXME')

    fic = self.parseInfoInto(fic, data)
    fic.upsert()

    chapter = fic.chapter(1)
    chapter.setHtml(data)
    chapter.localChapterId = str(1)
    chapter.url = self.constructUrl(fic.localId, 1)
    chapter.upsert()

    return Fic.lookup((fic.id, ))
def softScrape(self, chapter: FicChapter) -> Optional[str]:
    import scrape
    html = scrape.softScrape(chapter.url)
    if html is None:
        return html
    # TODO well this is a nightmare...
    if html.find('You are being redirected') < 0:
        return html
    import re
    match = re.search(r"window\.location = ['\"]([^'\"]*)['\"];", html)
    if match is None or match.group(1) is None:
        return html
    if chapter.url == match.group(1):
        raise Exception('redirect loop')
    # follow the interstitial redirect and retry with the new url
    chapter.url = match.group(1)
    chapter.upsert()
    return self.softScrape(chapter)
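# Minimal standalone sketch of the redirect detection used in softScrape
# above; the sample page text in the usage comment is fabricated for
# illustration.
import re
from typing import Optional

def _extractRedirectTarget(html: str) -> Optional[str]:
    if 'You are being redirected' not in html:
        return None
    match = re.search(r"window\.location = ['\"]([^'\"]*)['\"];", html)
    return None if match is None else match.group(1)

# _extractRedirectTarget(
#     'You are being redirected '
#     '<script>window.location = "https://example.com/x";</script>')
# -> 'https://example.com/x'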
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    raise Exception('FIXME TODO fanfics me format has changed')
    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    ficHead = soup.find('div', {'class': 'FicHead'})

    titleH1 = ficHead.find('h1')
    fic.title = titleH1.getText().strip()

    fandoms: List[str] = []
    trs = ficHead.findAll('div', {'class': 'tr'})
    author = None
    for tr in trs:
        divTitle = tr.find('div', {'class': 'title'})
        divContent = tr.find('div', {'class': 'content'})
        t = str(divTitle.getText()).strip()
        v = str(divContent.getText()).strip()
        if t == 'Автор:':  # Author:
            author = v
        elif t == 'Фандом:':  # Fandom:
            if v == 'Harry Potter' or v == 'Harry Potter - J. K. Rowling':
                fandoms += ['Harry Potter']
            else:
                raise Exception('unknown fandom: ' + v)
        elif t == 'Статус:':  # Status:
            if v == 'В процессе':  # in progress
                fic.ficStatus = FicStatus.ongoing
            elif v == 'Закончен':  # complete
                fic.ficStatus = FicStatus.complete
            else:
                raise Exception('unknown write status: ' + v)
        elif t == 'Опубликован:':  # Published:
            fic.published = self.parseRussianDate(v)
        elif t == 'Изменен:':  # Updated:
            fic.updated = self.parseRussianDate(v)
        elif t == 'Ссылка:':  # Link:
            src = v  # source archive url
        elif t == 'Читателей:':  # Readers:
            fic.followCount = int(v)
        elif t == 'Персонажи:':  # Characters:
            # characters, parse relationship?
            pass
        elif t == 'Рейтинг:':  # Rating:
            fic.ageRating = v
        elif t == 'Предупреждения:':  # Warnings:
            # warnings?
            pass
        else:
            raise Exception('unknown metadata: ' + t)

    # TODO?
    assert (author is not None)
    authorUrl = author
    authorId = author
    self.setAuthor(fic, author, authorUrl, authorId)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    if fic.url is None:
        fic.url = self.constructUrl(fic.localId)

    summaryTextDiv = soup.find('div', {'class': 'summary_text'})
    if summaryTextDiv is None:
        summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
    fic.description = summaryTextDiv.getText()

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    if fic.followCount is None:
        fic.followCount = 0
    fic.ageRating = 'M'

    ficContentsUl = soup.find('ul', {'class': 'FicContents'})
    chapterLinks = ficContentsUl.findAll('li', {'class': 't-b-dotted'})
    fic.chapterCount = len(chapterLinks)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapter = fic.chapter(cid)
        chapter.localChapterId = str(cid)
        chapter.url = self.constructUrl(fic.localId, cid)

        # try to get it out of the current blob first
        if chapter.html() is None:
            contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
            if contentDiv is not None:
                chapter.setHtml(
                    '<div class="ReadContent">' + str(contentDiv) + '</div>')

        if chapter.title is None or len(chapter.title) < 1:
            contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
            if contentDiv is not None:
                chapterTitle = contentDiv.previous_sibling
                if chapterTitle is not None and chapterTitle.name == 'h2':
                    chapter.title = chapterTitle.getText()

        # fall back to scraping the chapter directly
        if chapter.html() is None:
            cdata = scrape.softScrape(chapter.url)
            assert (cdata is not None)
            chapter.setHtml(self.extractContent(fic, cdata))
            csoup = BeautifulSoup(cdata, 'html5lib')
            contentDiv = csoup.find('div', {'id': 'c{}'.format(cid - 1)})
            chapterTitle = contentDiv.previous_sibling
            if chapterTitle is not None and chapterTitle.name == 'h2':
                chapter.title = chapterTitle.getText()

        if chapter.title is not None and len(chapter.title) > 0:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)
        chapter.upsert()
        wordCount += len(chapter.cachedContent().split())

    fic.wordCount = wordCount

    for fandom in fandoms:
        fic.add(Fandom.define(fandom))
    return fic
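# Hypothetical sketch of the parseRussianDate helper called above, assuming
# fanfics.me renders dates like '21 января 2015'; the adapter's real
# implementation may differ.
import datetime

_RU_MONTHS = {
    'января': 1, 'февраля': 2, 'марта': 3, 'апреля': 4, 'мая': 5, 'июня': 6,
    'июля': 7, 'августа': 8, 'сентября': 9, 'октября': 10, 'ноября': 11,
    'декабря': 12,
}

def parseRussianDateSketch(v: str) -> datetime.date:
    day, month, year = v.split()
    return datetime.date(int(year), _RU_MONTHS[month.lower()], int(day))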
def softScrape(self, chapter: FicChapter) -> Optional[str]:
    import scrape
    return scrape.softScrape(chapter.url)
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    from bs4 import BeautifulSoup
    archive = fic.localId.split('/')[0]
    storyNo = fic.localId.split('/')[1]

    soup = BeautifulSoup(wwwHtml, 'html5lib')

    titleH2 = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
    fic.title = str(titleH2.getText())

    membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
    memberLink = soup.find(
        lambda t: (
            t.name == 'a' and t.has_attr('href')
            and t.get('href') is not None
            and t.get('href').startswith(membersUrl)))
    author = memberLink.getText()
    authorId = memberLink.get('href')[len(membersUrl):]
    authorUrl = memberLink.get('href')
    self.setAuthor(fic, author, authorUrl, authorId)

    # TODO
    fic.ficStatus = FicStatus.ongoing

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?
    fic.url = self.constructUrl(fic.localId, 1)

    # TODO: description is on search page
    if fic.description is None:
        fic.description = 'TODO: on the search page?'

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0
    fic.ageRating = 'M'  # TODO

    if fic.published is None:
        fic.published = OilTimestamp.now()
    if fic.updated is None:
        fic.updated = fic.published

    chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
    chapterLinks = chapterDropdown.findAll('a')
    oldChapterCount = fic.chapterCount
    fic.chapterCount = len(chapterLinks)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapterContent = scrape.softScrape(self.constructUrl(fic.localId, cid))
        chapter = fic.chapter(cid)
        if chapterContent is not None:
            chapter.setHtml(chapterContent)
        chapter.localChapterId = str(cid)
        chapter.url = self.constructUrl(fic.localId, cid)
        chapter.title = chapterLinks[cid - 1].getText().strip()
        if chapter.title is not None:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)
        chapter.upsert()
        if chapterContent is not None:
            wordCount += len(chapterContent.split())

    fic.wordCount = wordCount

    if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
        fic.updated = OilTimestamp.now()  # TODO
    fic.upsert()

    storyUrl = self.constructUrl(fic.localId, chapterId=None)

    # more metadata from the search page
    searchUrl = ('http://{}.adult-fanfiction.org/search.php?'
                 + 'auth={}&title={}&summary=&tags=&cats=0&search=Search')
    searchUrl = searchUrl.format(archive, author, fic.title.replace(' ', '+'))
    data = scrape.scrape(searchUrl)['raw']
    metas = self.extractSearchMetadata(data)

    # fall back to a pure author search
    if storyUrl not in metas:
        searchUrl = ('http://{}.adult-fanfiction.org/search.php?'
                     + 'auth={}&title=&summary=&tags=&cats=0&search=Search')
        searchUrl = searchUrl.format(archive, author)
        data = scrape.scrape(searchUrl)['raw']
        metas = self.extractSearchMetadata(data)
    if storyUrl not in metas:
        raise Exception('cannot find search metadata')

    meta = metas[storyUrl]
    assert (meta.published is not None and meta.updated is not None)
    fic.published = OilTimestamp(meta.published)
    fic.updated = OilTimestamp(meta.updated)
    fic.reviewCount = meta.reviewCount
    fic.favoriteCount = meta.views  # TODO
    fic.ficStatus = meta.ficStatus
    assert (meta.description is not None)
    fic.description = meta.description
    assert (fic.description is not None)
    if len(meta.tags) > 0:
        fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

    for fan in meta.fandoms:
        fic.add(Fandom.define(fan))
    return fic
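# Hedged sketch of the record shape parseInfoInto reads back from
# extractSearchMetadata: a mapping from story url to an object with at least
# the fields used above. The real class may carry more; this mirrors only the
# usage in this method.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class SearchMetadataSketch:
    published: Optional[int] = None  # unix time, wrapped in OilTimestamp above
    updated: Optional[int] = None
    reviewCount: int = 0
    views: int = 0
    ficStatus: FicStatus = FicStatus.ongoing
    description: Optional[str] = None
    tags: List[str] = field(default_factory=list)
    fandoms: List[str] = field(default_factory=list)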