def createFromZList(self, fic: Fic, ts: int, data: str) -> Fic:
	fic.url = self.constructUrl(fic.localId, 1)
	fic = self.parseZListInfoInto(fic, ts, data)
	fic.upsert()
	return Fic.lookup((fic.id, ))
def getCurrentInfo(self, fic: Fic) -> Fic:
	fic.url = self.baseUrl + str(fic.localId)
	url = fic.url.split('?')[0] + '?view_adult=true'
	# scrape fresh info
	data = scrape.scrape(url)
	return self.parseInfoInto(fic, data['raw'])
def create(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId)
	data = scrape.softScrape(fic.url)
	if data is None:
		raise Exception('unable to scrape? FIXME')
	fic = self.parseInfoInto(fic, data)
	fic.upsert()
	return Fic.lookup((fic.id, ))
def getCurrentInfo(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId)
	url = self.tocUrl
	data = scrape.scrape(url)
	edumpContent('<!-- {} -->\n{}'.format(url, data['raw']), 'wavesarisen_ec')
	fic = self.parseInfoInto(fic, data['raw'])
	fic.upsert()
	return Fic.lookup((fic.id, ))
def create(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId)
	# scrape fresh info
	data = scrape.scrape(fic.url)
	edumpContent(data['raw'], 'sugarquill')
	fic = self.parseInfoInto(fic, data['raw'])
	fic.upsert()
	return Fic.lookup((fic.id, ))
def create(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId)
	# scrape fresh info
	data = scrape.scrape(fic.url)
	time.sleep(self.baseDelay)
	edumpContent(data['raw'], 'hpffa')
	fic = self.parseInfoInto(fic, data['raw'])
	fic.upsert()
	return Fic.lookup((fic.id, ))
def create(self, fic: Fic) -> Fic:
	# TODO: should we try to get the actual url here, including the url-safe
	# version of the title before the lid? This needs to be done elsewhere in
	# this adapter as well.
	fic.url = self.baseUrl + 'threads/' + str(fic.localId)
	# scrape fresh info
	data = self.scrapeLike(fic.url)
	fic = self.parseInfoInto(fic, data)
	fic.upsert()
	return Fic.lookup((fic.id, ))
def create(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId, 1)
	# scrape fresh info
	data = scrape.scrape(fic.url)
	fic = self.parseInfoInto(fic, data['raw'])
	fic.insert()

	chapter = fic.chapter(1)
	chapter.setHtml(data['raw'])
	chapter.upsert()

	return Fic.lookup((fic.id, ))
def getCurrentInfo(self, fic: Fic) -> Fic:
	# FIXME when fics are deleted they 404:
	# https://www.royalroad.com/fiction/38947/
	# 404
	# Page Not Found
	# The server has returned the following error:
	# This fiction has been deleted
	fic.url = self.constructUrl(fic.localId)
	data = self.scrape(fic.url)
	if 'raw' not in data:
		raise Exception('unable to scrape? FIXME')
	raw = data['raw']
	return self.parseInfoInto(fic, raw)
def create(self, fic: Fic) -> Fic:
	fic.url = self.baseUrl + str(fic.localId)
	# scrape fresh info
	url = fic.url.split('?')[0] + '?view_adult=true'
	data = scrape.scrape(url)
	edumpContent(data['raw'], 'ao3')
	fic = self.parseInfoInto(fic, data['raw'])
	fic.upsert()

	chapter = fic.chapter(1)
	chapter.setHtml(data['raw'])
	chapter.upsert()

	return Fic.lookup((fic.id, ))
def create(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId, 1)
	# scrape fresh info
	data = scrape.softScrape(fic.url)
	if data is None:
		raise Exception('unable to scrape? FIXME')
	fic = self.parseInfoInto(fic, data)
	fic.upsert()

	chapter = fic.chapter(1)
	chapter.setHtml(data)
	chapter.localChapterId = str(1)
	chapter.url = self.constructUrl(fic.localId, 1)
	chapter.upsert()

	return Fic.lookup((fic.id, ))
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html.parser')
	divDetails = soup.find_all('div', {'class': 'details'})
	if len(divDetails) != 1:
		raise Exception('error: unable to find details\n')
	else:
		divDetails = divDetails[0]
	text = divDetails.get_text()
	pt_str = str(divDetails)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	divTitle = soup.find_all('div', {'class': 'title'})
	if len(divTitle) == 1:
		fic.title = divTitle[0].get_text().strip()
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1)

	# TODO: this may not exist on fictionhunt?
	fic.description = 'archive of {} from fictionhunt TODO'.format(fic.title)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Updated:\s+(\S+)', str),
			'published': (r'Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)

	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search('- Complete -', text)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		fic.ficStatus = FicStatus.complete

	for a in divDetails.find_all('a'):
		a_href = a.get('href')
		if a_href.find('fanfiction.net/u/') != -1:
			author = a.get_text()
			authorUrl = a_href
			authorId = a_href.split('/')[-1]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	# TODO: hardcode Harry Potter fanfic?
	return fic
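
# Note on the block above: RegexMatcher keys ending in '?' (e.g.
# 'chapterCount?') appear to mark optional fields that may be absent from the
# scraped text, while bare keys are required. The published/updated
# normalization that follows matchAll() recurs in most adapters below; a
# minimal sketch of a shared helper, assuming the OilTimestamp and
# util.parseDateAsUnix behavior used above (hypothetical helper, not part of
# this codebase):
def normalizeFicDates(fic: Fic) -> None:
	# parse the scraped 'published' date relative to the fetch time
	if fic.published is not None:
		fic.published = OilTimestamp(
			util.parseDateAsUnix(fic.published, fic.fetched)
		)
	# fall back to 'published' when no 'updated' date was scraped
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		fic.updated = OilTimestamp(
			util.parseDateAsUnix(fic.updated, fic.fetched)
		)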
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html.parser')
	storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
	if len(storyMainInfo) != 1:
		raise Exception('unable to find main story info')
	storyMainInfo = storyMainInfo[0]

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
	for a in soup.findAll('a'):
		href = a.get('href')
		if (not href.startswith(disclaimerJs)
				and href != '?psid={}'.format(fic.localId)):
			continue
		fic.title = a.getText()
		break
	else:
		raise Exception('error: unable to find title')

	fic.url = self.constructUrl(fic.localId)

	storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
	if len(storySummaryTable) != 1:
		raise Exception('cannot find story summary table')
	storySummaryTable = storySummaryTable[0]
	fic.description = storySummaryTable.getText().strip()
	if fic.description is None:
		raise Exception('error: unable to find description')

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	text = storyMainInfo.getText().replace('\xa0', ' ')
	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rating:\s+(Mature|15\+|12\+)', str),
			'chapterCount': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\d+)', int),
			'reviewCount': (r'Story Reviews:\s*(\d+)', int),
			'favoriteCount': (r'Favorite Story Of:\s+(\d+) users', int),
			'updated': (r'Last Updated:\s+(\S+)', str),
			'published': (r'First Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)

	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(r'Status:\s+(Completed|Work In Progress|Abandoned)', text)
	if match is None:
		raise Exception('cannot find write status')
	status = match.group(1)
	if status == 'Completed':
		fic.ficStatus = FicStatus.complete
	elif status == 'Work In Progress':
		fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
	elif status == 'Abandoned':
		fic.ficStatus = FicStatus.abandoned
	else:
		raise Exception('unknown status: {}'.format(status))

	for a in soup.findAll('a'):
		a_href = a.get('href')
		if a_href.startswith('viewuser.php?showuid='):
			author = a.get_text()
			authorUrl = self.baseUrl + '/' + a_href
			authorId = a_href[len('viewuser.php?showuid='):]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	# TODO: chars/pairings?
	fic.add(Fandom.define('Harry Potter'))
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	raise Exception('FIXME TODO fanfics me format has changed')
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	ficHead = soup.find('div', {'class': 'FicHead'})

	titleH1 = ficHead.find('h1')
	fic.title = titleH1.getText().strip()

	fandoms: List[str] = []
	trs = ficHead.findAll('div', {'class': 'tr'})
	author = None
	for tr in trs:
		divTitle = tr.find('div', {'class': 'title'})
		divContent = tr.find('div', {'class': 'content'})
		t = str(divTitle.getText()).strip()
		v = str(divContent.getText()).strip()
		if t == 'Автор:':  # author
			author = v
		elif t == 'Фандом:':  # fandom
			if v == 'Harry Potter' or v == 'Harry Potter - J. K. Rowling':
				fandoms += ['Harry Potter']
			else:
				raise Exception('unknown fandom: ' + v)
		elif t == 'Статус:':  # status
			if v == 'В процессе':  # in progress
				fic.ficStatus = FicStatus.ongoing
			elif v == 'Закончен':  # finished
				fic.ficStatus = FicStatus.complete
			else:
				raise Exception('unknown write status: ' + v)
		elif t == 'Опубликован:':  # published
			fic.published = self.parseRussianDate(v)
		elif t == 'Изменен:':  # updated
			fic.updated = self.parseRussianDate(v)
		elif t == 'Ссылка:':  # link
			src = v  # source archive url
		elif t == 'Читателей:':  # readers
			fic.followCount = int(v)
		elif t == 'Персонажи:':  # characters, parse relationship?
			pass
		elif t == 'Рейтинг:':  # rating
			fic.ageRating = v
		elif t == 'Предупреждения:':  # warnings?
			pass
		else:
			raise Exception('unknown metadata: ' + t)  # TODO?

	assert (author is not None)
	authorUrl = author
	authorId = author
	self.setAuthor(fic, author, authorUrl, authorId)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	if fic.url is None:
		fic.url = self.constructUrl(fic.localId)

	summaryTextDiv = soup.find('div', {'class': 'summary_text'})
	if summaryTextDiv is None:
		summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
	fic.description = summaryTextDiv.getText()

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	if fic.followCount is None:
		fic.followCount = 0
	fic.ageRating = 'M'

	ficContentsUl = soup.find('ul', {'class': 'FicContents'})
	chapterLinks = ficContentsUl.findAll('li', {'class': 't-b-dotted'})
	fic.chapterCount = len(chapterLinks)

	if fic.wordCount is None:
		fic.wordCount = 0
	fic.upsert()

	wordCount = 0
	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		chapter.localChapterId = str(cid)
		chapter.url = self.constructUrl(fic.localId, cid)

		# try to get it out of the current blob first
		if chapter.html() is None:
			contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
			if contentDiv is not None:
				chapter.setHtml(
					'<div class="ReadContent">' + str(contentDiv) + '</div>'
				)

		if chapter.title is None or len(chapter.title) < 1:
			contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
			if contentDiv is not None:
				chapterTitle = contentDiv.previous_sibling
				if chapterTitle is not None and chapterTitle.name == 'h2':
					chapter.title = chapterTitle.getText()

		# fall back to scraping it directly
		if chapter.html() is None:
			cdata = scrape.softScrape(chapter.url)
			assert (cdata is not None)
			chapter.setHtml(self.extractContent(fic, cdata))
			csoup = BeautifulSoup(cdata, 'html5lib')
			contentDiv = csoup.find('div', {'id': 'c{}'.format(cid - 1)})
			chapterTitle = contentDiv.previous_sibling
			if chapterTitle is not None and chapterTitle.name == 'h2':
				chapter.title = chapterTitle.getText()

		if chapter.title is not None and len(chapter.title) > 0:
			chapter.title = util.cleanChapterTitle(chapter.title, cid)
		chapter.upsert()
		wordCount += len(chapter.cachedContent().split())

	fic.wordCount = wordCount

	for fandom in fandoms:
		fic.add(Fandom.define(fandom))

	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup
	authorLid = fic.localId.split('/')[0]
	storyLid = fic.localId.split('/')[1]

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'

	soup = BeautifulSoup(wwwHtml, 'html5lib')
	pageHeader = soup.find('div', {'class': 'page-header'})
	titleH2 = pageHeader.find('h2')
	fic.title = titleH2.getText().strip()

	authorLink = pageHeader.find('a')
	author = authorLink.getText().strip()
	authorId = authorLid
	authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
	self.setAuthor(fic, author, authorUrl, authorId)

	divWell = soup.find('div', {'class': 'well'})

	summaryQuote = divWell.find('blockquote')
	fic.description = str(summaryQuote.getText()) \
		.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
	# collapse runs of whitespace down to single spaces
	while fic.description.find('  ') != -1:
		fic.description = fic.description.replace('  ', ' ')
	fic.description = fic.description.strip()

	divWellText = divWell.getText().strip()
	match = re.search(r'Status:\s*([^-]*) -', divWellText)
	if match is not None and match.group(1) == 'In progress':
		fic.ficStatus = FicStatus.ongoing
	else:
		raise Exception('unable to find fic status')

	RegexMatcher(
		divWellText, {
			'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
			'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
			'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
		}
	).matchAll(fic)
	assert (fic.chapterCount is not None)
	if str(fic.wordCount).find(',') != -1:
		fic.wordCount = int(str(fic.wordCount).replace(',', ''))

	wellParent = divWell.parent
	cid = 0
	wordCount = 0
	reviewCount = 0
	chapterDates: List[int] = []
	for child in wellParent.children:
		if child.name != 'p':
			continue
		cid += 1
		if str(child).find('Chapter {}'.format(cid)) == -1:
			continue
		chapterLink = child.find('a')
		expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
		if chapterLink.get('href').lower() != expectedUrl:
			raise Exception('unexpected chapter url: ' + chapterLink.get('href'))

		chInfo = ChapterInfo()
		RegexMatcher(
			child.getText(), {
				'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
				'reviewCount': (r'Reviews\s*:\s+([^-]+) -', int),
				'updated': (r'Uploaded on\s*:\s+(.+)', str),
			}
		).matchAll(chInfo)
		assert (chInfo.updated is not None)
		if str(chInfo.wordCount).find(',') != -1:
			chInfo.wordCount = int(str(chInfo.wordCount).replace(',', ''))
		wordCount += chInfo.wordCount
		reviewCount += chInfo.reviewCount
		dt = util.parseDateAsUnix(chInfo.updated, int(time.time()))
		chapterDates += [dt]

	# wordCount is already set from the overall metadata
	fic.reviewCount = reviewCount
	fic.published = OilTimestamp(min(chapterDates))
	fic.updated = OilTimestamp(max(chapterDates))
	fic.upsert()

	for cid in range(1, fic.chapterCount + 1):
		ch = fic.chapter(cid)
		ch.localChapterId = 'Chapter_{}'.format(cid)
		ch.url = self.constructUrl(fic.localId, cid)
		ch.upsert()

	return fic
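
# The comma-stripped integer parsing above ('12,345' -> 12345) recurs in a
# few adapters; a minimal helper sketch under the same assumptions
# (hypothetical helper, not part of this codebase):
def parseCommaInt(s: str) -> int:
	# '12,345' and '12345' both parse to 12345
	return int(str(s).replace(',', ''))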
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup  # type: ignore
	deletedFicTexts = [
		# probably deleted by user
		'Story Not FoundUnable to locate story. Code 1.',
		# probably deleted by admin
		'Story Not FoundUnable to locate story. Code 2.',
		# unknown
		'Story Not FoundStory is unavailable for reading. (A)',
	]
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	profile_top = soup.find(id='profile_top')

	# story might've been deleted
	if profile_top is None:
		gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
		for gui_warning in gui_warnings:
			for deletedFicText in deletedFicTexts:
				if gui_warning.get_text() == deletedFicText:
					if fic.ficStatus != FicStatus.complete:
						fic.ficStatus = FicStatus.abandoned
					fic.upsert()
					return fic

	text = profile_top.get_text()
	pt_str = str(profile_top)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	for b in profile_top.find_all('b'):
		b_class = b.get('class')
		if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
			fic.title = b.get_text()
			break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	descriptionFound = False
	for div in profile_top.find_all('div'):
		div_class = div.get('class')
		if (
			div.get('style') == 'margin-top:2px' and len(div_class) == 1
			and div_class[0] == 'xcontrast_txt'
		):
			fic.description = div.get_text()
			descriptionFound = True
			break
	if not descriptionFound:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	# TODO: we should match this only on the section following the description
	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Rated:.*Updated:\s+(\S+)', str),
			'published': (r'Published:\s+([^-]+)', str),
		}
	)
	matcher.matchAll(fic)

	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(
		r'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
	)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(2)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}: {}'.format(fic.url, status))

	for a in profile_top.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	preStoryLinks = soup.find(id='pre_story_links')
	preStoryLinksLinks = []
	if preStoryLinks is not None:
		preStoryLinksLinks = preStoryLinks.find_all('a')
	pendingFandoms: List[Fandom] = []
	for a in preStoryLinksLinks:
		href = a.get('href')
		hrefParts = href.split('/')

		# if it's a top level category
		if (
			len(hrefParts) == 3 and len(hrefParts[0]) == 0
			and len(hrefParts[2]) == 0
		):
			cat = hrefParts[1]
			if cat in ffNetFandomCategories:
				continue  # skip categories
			raise Exception('unknown category: {}'.format(cat))

		# if it's a crossover /Fandom1_and_Fandom2_Crossovers/f1id/f2id/
		if (
			len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
			and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
		):
			fIds = [int(hrefParts[2]), int(hrefParts[3])]
			pendingFandoms += self.handleCrossoverFandom(
				fic, hrefParts[1], fIds, href
			)
			continue

		# if it's a regular fandom in some category
		if (
			len(hrefParts) == 4 and len(hrefParts[0]) == 0
			and len(hrefParts[3]) == 0
		):
			# ensure category is in our map
			if hrefParts[1] not in ffNetFandomCategories:
				raise Exception('unknown category: {}'.format(hrefParts[1]))
			pendingFandoms += self.handleFandom(fic, hrefParts[2])
			continue

		util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

	fic.upsert()
	poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
	if len(poss) != 1:
		raise Exception('unable to upsert fic?')
	fic = poss[0]
	for pfandom in pendingFandoms:
		fic.add(pfandom)

	if fic.chapterCount is None:
		return fic

	chapterTitles = []
	if fic.chapterCount > 1:
		chapterSelect = soup.find(id='chap_select')
		chapterOptions = []
		if chapterSelect is not None:
			chapterOptions = chapterSelect.findAll('option')
		chapterTitles = [co.getText().strip() for co in chapterOptions]

	for cid in range(1, fic.chapterCount + 1):
		ch = fic.chapter(cid)
		ch.localChapterId = str(cid)
		ch.url = self.constructUrl(fic.localId, cid)
		if len(chapterTitles) >= cid:
			ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
		elif fic.chapterCount == 1 and cid == 1:
			ch.title = fic.title
		ch.upsert()

	metaSpan = profile_top.find('span', {'class': 'xgray'})
	if metaSpan is not None:
		try:
			res = self.parseFicMetaSpan(metaSpan.decode_contents())
			#fic.language = res["language"]

			# reconstruct the meta line from the parsed fields
			fields = [
				('rated', 'Rated: Fiction ZZZ'),
				('language', 'Language: ZZZ'),
				('genres', 'Genre: ZZZ'),
				('characters', 'Characters: ZZZ'),
				('reviews', 'Reviews: ZZZ'),
				('favorites', 'Favs: ZZZ'),
				('follows', 'Follows: ZZZ'),
			]
			rmeta = ' - '.join(
				[f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]
			)
			fic.extraMeta = rmeta

			publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
			fic.published = OilTimestamp(publishedUts)
			fic.updated = fic.published
			if 'updated' in res:
				updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			fic.upsert()
		except Exception as e:
			util.logMessage(
				f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
			)
			util.logMessage(
				f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
			)

	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup
	archive = fic.localId.split('/')[0]
	storyNo = fic.localId.split('/')[1]

	soup = BeautifulSoup(wwwHtml, 'html5lib')

	titleH2 = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
	fic.title = str(titleH2.getText())

	membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
	memberLink = soup.find(
		lambda t: (
			t.name == 'a' and t.has_attr("href")
			and t.get("href") is not None
			and t.get("href").startswith(membersUrl)
		)
	)
	author = memberLink.getText()
	authorId = memberLink.get('href')[len(membersUrl):]
	authorUrl = memberLink.get('href')
	self.setAuthor(fic, author, authorUrl, authorId)

	# TODO
	fic.ficStatus = FicStatus.ongoing

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId, 1)

	# TODO: description is on search page
	if fic.description is None:
		fic.description = 'TODO: on the search page?'

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'  # TODO

	if fic.published is None:
		fic.published = OilTimestamp.now()
	if fic.updated is None:
		fic.updated = fic.published

	chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
	chapterLinks = chapterDropdown.findAll('a')
	oldChapterCount = fic.chapterCount
	fic.chapterCount = len(chapterLinks)

	if fic.wordCount is None:
		fic.wordCount = 0
	fic.upsert()

	wordCount = 0
	for cid in range(1, fic.chapterCount + 1):
		chapterContent = scrape.softScrape(self.constructUrl(fic.localId, cid))
		chapter = fic.chapter(cid)
		if chapterContent is not None:
			chapter.setHtml(chapterContent)
		chapter.localChapterId = str(cid)
		chapter.url = self.constructUrl(fic.localId, cid)
		chapter.title = chapterLinks[cid - 1].getText().strip()
		if chapter.title is not None:
			chapter.title = util.cleanChapterTitle(chapter.title, cid)
		chapter.upsert()
		if chapterContent is not None:
			wordCount += len(chapterContent.split())

	fic.wordCount = wordCount

	if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
		fic.updated = OilTimestamp.now()  # TODO
	fic.upsert()

	storyUrl = self.constructUrl(fic.localId, chapterId=None)

	# more metadata from the search page
	searchUrl = (
		'http://{}.adult-fanfiction.org/search.php?'
		+ 'auth={}&title={}&summary=&tags=&cats=0&search=Search'
	)
	searchUrl = searchUrl.format(archive, author, fic.title.replace(' ', '+'))
	data = scrape.scrape(searchUrl)['raw']
	metas = self.extractSearchMetadata(data)

	# fall back to a pure author search
	if storyUrl not in metas:
		searchUrl = (
			'http://{}.adult-fanfiction.org/search.php?'
			+ 'auth={}&title=&summary=&tags=&cats=0&search=Search'
		)
		searchUrl = searchUrl.format(archive, author)
		data = scrape.scrape(searchUrl)['raw']
		metas = self.extractSearchMetadata(data)

	if storyUrl not in metas:
		raise Exception('cannot find search metadata')
	meta = metas[storyUrl]

	assert (meta.published is not None and meta.updated is not None)
	fic.published = OilTimestamp(meta.published)
	fic.updated = OilTimestamp(meta.updated)
	fic.reviewCount = meta.reviewCount
	fic.favoriteCount = meta.views  # TODO
	fic.ficStatus = meta.ficStatus

	assert (meta.description is not None)
	fic.description = meta.description
	assert (fic.description is not None)
	if len(meta.tags) > 0:
		fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

	for fan in meta.fandoms:
		fic.add(Fandom.define(fan))

	return fic
def create(self, fic: Fic) -> Fic:
	fic.url = self.constructUrl(fic.localId)
	return self.getCurrentInfo(fic)
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(wwwHtml, 'html5lib')

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'  # TODO?

	ficTitleDiv = soup.find('div', {'class': 'fic-title'})
	fic.title = ficTitleDiv.find('h1').getText().strip()

	authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
	author = authorLink.getText().strip()
	authorUrl = self.baseUrl + authorLink.get('href')
	authorId = authorUrl.split('/')[-1]
	self.setAuthor(fic, author, authorUrl, authorId)

	divDescription = soup.find('div', {'class': 'description'})
	try:
		descView = HtmlView(str(divDescription), markdown=False)
		desc = ''.join(['<p>{}</p>'.format(l) for l in descView.text])
		fic.description = desc
	except Exception:
		fic.description = divDescription.getText().strip()

	fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
	if fictionInfo.find('>ONGOING<') != -1:
		fic.ficStatus = FicStatus.ongoing
	elif fictionInfo.find('>COMPLETED<') != -1:
		fic.ficStatus = FicStatus.complete
	elif fictionInfo.find('>HIATUS<') != -1:
		fic.ficStatus = FicStatus.ongoing  # TODO?
	elif fictionInfo.find('>STUB<') != -1:
		fic.ficStatus = FicStatus.ongoing  # TODO?
	elif fictionInfo.find('>DROPPED<') != -1:
		fic.ficStatus = FicStatus.abandoned
	else:
		raise Exception('unable to find fic status')

	divStatsContent = soup.find('div', {'class': 'stats-content'})
	followers = divStatsContent.find(text='Followers :')
	ul = followers.parent.parent

	RegexMatcher(
		ul.getText(), {
			'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
			'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
		}
	).matchAll(fic)
	if str(fic.followCount).find(',') != -1:
		fic.followCount = int(str(fic.followCount).replace(',', ''))
	if str(fic.favoriteCount).find(',') != -1:
		fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

	tableChapters = soup.find('table', {'id': 'chapters'})
	chapterLinks = tableChapters.findAll('a')

	chapterUrls: List[str] = []
	chapterTitles: List[str] = []
	for chapterLink in chapterLinks:
		# TODO FIXME is this inverted?
		if chapterLink.find('time') is not None:
			continue
		chapterUrls += [chapterLink.get('href')]
		chapterTitles += [chapterLink.getText().strip()]

	chapterDates: List[int] = []
	for chapterLink in chapterLinks:
		if chapterLink.find('time') is None:
			continue
		timeElement = chapterLink.find('time')
		if timeElement.get('unixtime'):
			chapterDates += [int(timeElement.get('unixtime'))]
		else:
			chapterDates += [
				util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
			]

	fic.published = OilTimestamp(min(chapterDates))
	fic.updated = OilTimestamp(max(chapterDates))
	fic.chapterCount = len(chapterUrls)

	if fic.wordCount is None:
		fic.wordCount = 0
	fic.upsert()

	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		chapter.url = self.baseUrl + chapterUrls[cid - 1]
		if chapterUrls[cid - 1].startswith('/fiction/chapter/'):
			# alternate chapter syntax if the chapter itself has no slug
			# /fiction/chapter/<lcid>?fid=<lid>&fslug=<fic slug>
			chapter.localChapterId = (
				chapterUrls[cid - 1].split('/')[3].split('?')[0]
			)
		else:
			# standard chapter syntax
			# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
			chapter.localChapterId = chapterUrls[cid - 1].split('/')[5]
		chapter.title = chapterTitles[cid - 1]
		if chapter.title is not None and len(chapter.title) > 0:
			chapter.title = util.cleanChapterTitle(chapter.title, cid)
		chapter.upsert()

	wordCount = 0
	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		if chapter.html() is None:
			chapter.cache()
			chapter.upsert()
		chtml = chapter.html()
		if chtml is not None:
			wordCount += len(chtml.split())
	fic.wordCount = wordCount

	return fic
def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
	# existing data is newer, do nothing
	if fic.fetched is not None and fic.fetched.toUTS() > ts:
		return fic

	from bs4 import BeautifulSoup
	soup = BeautifulSoup(html, 'html5lib')
	text = soup.get_text()
	pt_str = str(html)

	fic.fetched = OilTimestamp(ts)
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	for a in soup.find_all('a', {'class': 'stitle'}):
		fic.title = a.getText()
		break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	for div in soup.find_all('div', {'class': 'z-padtop'}):
		fic.description = div.contents[0]
		break
	else:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+(?:Fiction)?\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Updated:\s+(\S+)', str),
			'published': (r'Published:\s+([^-]+)', str),
		}
	)
	matcher.matchAll(fic)

	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(
		r'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
	)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(2)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}: {}'.format(fic.url, status))

	for a in soup.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	zl = soup.find('div', {'class': 'z-list'})
	fan = None if zl is None else zl.get('data-category')
	pendingFandoms: List[Fandom] = []
	if fan is not None:
		pendingFandoms += self.handleFandom(fic, fan)
		# TODO: crossovers?

	fic.upsert()
	for pfandom in pendingFandoms:
		fic.add(pfandom)

	return fic
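
# Note on the for/else blocks used throughout these parsers: Python runs a
# loop's else clause only when the loop finishes without hitting break, so
# each search loop above raises exactly when no matching element was found.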
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	from bs4 import BeautifulSoup  # type: ignore
	deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	profile_top = soup.find(id='profile_top')

	# story might've been deleted
	if profile_top is None:
		gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
		for gui_warning in gui_warnings:
			if gui_warning.get_text() == deletedFicText:
				fic.ficStatus = FicStatus.abandoned
				fic.upsert()
				return fic

	text = profile_top.get_text()
	pt_str = str(profile_top)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	for b in profile_top.find_all('b'):
		b_class = b.get('class')
		if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
			fic.title = b.get_text()
			break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	for div in profile_top.find_all('div'):
		div_class = div.get('class')
		if (
			div.get('style') == 'margin-top:2px' and len(div_class) == 1
			and div_class[0] == 'xcontrast_txt'
		):
			fic.description = div.get_text()
			break
	else:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Updated:\s+(\S+)', str),
			'published': (r'Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)

	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(r'Status:\s+(\S+)', text)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(1)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}'.format(status))

	for a in profile_top.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	preStoryLinks = soup.find(id='pre_story_links')
	preStoryLinksLinks = preStoryLinks.find_all('a')
	for a in preStoryLinksLinks:
		href = a.get('href')
		hrefParts = href.split('/')

		# if it's a top level category
		if (
			len(hrefParts) == 3 and len(hrefParts[0]) == 0
			and len(hrefParts[2]) == 0
		):
			cat = hrefParts[1]
			if cat in fictionPressCategories:
				continue  # skip categories
			raise Exception('unknown category: {}'.format(cat))

		# if it's a regular genre in some category
		if (
			len(hrefParts) == 4 and len(hrefParts[0]) == 0
			and len(hrefParts[3]) == 0
		):
			# ensure category is in our map
			if hrefParts[1] not in fictionPressCategories:
				raise Exception('unknown category: {}'.format(hrefParts[1]))

			# ensure it's in our whitelist
			if hrefParts[2] not in fictionPressGenres:
				util.logMessage(
					f'FictionPressAdapter: unknown genre {hrefParts[2]}'
				)
				continue

			fic.add(Fandom.define(hrefParts[2]))
			continue

		util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')
		continue

	fic.upsert()

	chapterTitles = []
	if fic.chapterCount > 1:
		chapterSelect = soup.find(id='chap_select')
		chapterOptions = []
		if chapterSelect is not None:
			chapterOptions = chapterSelect.findAll('option')
		chapterTitles = [co.getText().strip() for co in chapterOptions]

	for cid in range(fic.chapterCount):
		ch = fic.chapter(cid + 1)
		ch.localChapterId = str(cid + 1)
		if len(chapterTitles) > cid:
			ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
		elif fic.chapterCount == 1 and cid == 0:
			ch.title = fic.title
		ch.upsert()

	return fic