コード例 #1
0
    def handleMetadataPair(self, key, value):
        """
        Handles a key-value pair of story metadata.

        Returns straight away if the value is 'None' (that's a string)

        Can be overridden by subclasses::
            def handleMetadataPair(self, key, value):
                if key == 'MyCustomKey':
                    self.story.setMetadata('somekye', value)
                else:
                    super(NameOfMyAdapter, self).handleMetadata(key, value)
        """
        if value == "None":
            return
        elif key == "Summary":
            self.setDescription(self.url, value)
        # Raw strings below: "\s" inside a plain literal is an invalid
        # escape sequence (DeprecationWarning since Python 3.6).
        elif "Genre" in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList("genre", val)
        elif "Warning" in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList("warnings", val)
        elif "Characters" in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList("characters", val)
        elif "Categories" in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList("category", val)
        elif "Challenges" in key:
            for val in re.split(r"\s*,\s*", value):
                # TODO this should be an official field I guess
                self.story.addToList("challenge", val)
        elif key == "Chapters":
            self.story.setMetadata("numChapters", int(value))
        elif key == "Rating" or key == "Rated":
            self.story.setMetadata("rating", value)
        elif key == "Word count":
            self.story.setMetadata("numWords", value)
        elif key == "Completed":
            if "Yes" in value:
                self.story.setMetadata("status", "Completed")
            else:
                self.story.setMetadata("status", "In-Progress")
        elif key == "Read":
            # TODO this should be an official field I guess
            self.story.setMetadata("readings", value)
        elif key == "Published":
            self.story.setMetadata("datePublished", makeDate(value, self.getDateFormat()))
        elif key == "Updated":
            self.story.setMetadata("dateUpdated", makeDate(value, self.getDateFormat()))
        elif key == "Pairing":
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList("ships", val)
        elif key == "Series":
            ## TODO is not a link in the printable view, so no seriesURL possible
            self.story.setMetadata("series", value)
        else:
            logger.info("Unhandled metadata pair: '%s' : '%s'" % (key, value))
コード例 #2
0
    def handleMetadataPair(self, key, value):
        """
        Handles a key-value pair of story metadata.

        Returns straight away if the value is 'None' (that's a string)

        Can be overridden by subclasses::
            def handleMetadataPair(self, key, value):
                if key == 'MyCustomKey':
                    self.story.setMetadata('somekye', value)
                else:
                    super(NameOfMyAdapter, self).handleMetadata(key, value)
        """
        if value == 'None':
            return
        elif key == 'Summary':
            self.setDescription(self.url, value)
        # Raw strings below: "\s" inside a plain literal is an invalid
        # escape sequence (DeprecationWarning since Python 3.6).
        elif 'Genre' in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList('genre', val)
        elif 'Warning' in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList('warnings', val)
        elif 'Characters' in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList('characters', val)
        elif 'Categories' in key:
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList('category', val)
        elif 'Challenges' in key:
            for val in re.split(r"\s*,\s*", value):
                # TODO this should be an official field I guess
                self.story.addToList('challenge', val)
        elif key == 'Chapters':
            self.story.setMetadata('numChapters', int(value))
        elif key == 'Rating' or key == 'Rated':
            self.story.setMetadata('rating', value)
        elif key == 'Word count':
            self.story.setMetadata('numWords', value)
        elif key == 'Completed':
            if 'Yes' in value:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')
        elif key == 'Read':
            # TODO this should be an official field I guess
            self.story.setMetadata('readings', value)
        elif key == 'Published':
            self.story.setMetadata('datePublished', makeDate(value, self.getDateFormat()))
        elif key == 'Updated':
            self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat()))
        elif key == 'Pairing':
            for val in re.split(r"\s*,\s*", value):
                self.story.addToList('ships', val)
        elif key == 'Series':
            ## TODO is not a link in the printable view, so no seriesURL possible
            self.story.setMetadata('series', value)
        else:
            logger.info("Unhandled metadata pair: '%s' : '%s'" % (key, value))
コード例 #3
0
    def parseOtherAttributes(self, other_attribute_element):
        """Parse the story's attribute block into self.story metadata.

        Reads each <b> label and its following value to record rating,
        score, site tags, genre and the posted/concluded/updated dates.
        Story status comes from the <span class="ab"> element: absent
        means 'Completed'; 'Incomplete and Inactive' in its text means
        'Incomplete'; anything else means 'In-Progress', optionally with
        a "Last Activity" date taken from the adjacent <noscript>.
        """
        for b in other_attribute_element.findAll('b'):
            #logger.debug('Getting metadata: "%s"' % b)
            label = b.text
            if label in ['Posted:', 'Concluded:', 'Updated:']:
                # Date fields are rendered client-side by JS; the static
                # fallback text lives in the following <noscript> tag.
                value = b.findNext('noscript').text
                #logger.debug('Have a date field label: "%s", value: "%s"' % (label, value))
            else:
                value = b.nextSibling
            #logger.debug('label: "%s", value: "%s"' % (label, value))

            if 'Sex' in label:
                self.story.setMetadata('rating', value)
            if 'Score' in label and value != '-':
                self.story.setMetadata('score', value)

            if 'Tags' in label or 'Codes' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
                    self.story.addToList('sitetags', code)
            if 'Genre' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
                    self.story.addToList('genre', code)

            # A 'Posted' label sets both datePublished and dateUpdated.
            if 'Posted' in label:
                self.story.setMetadata(
                    'datePublished', makeDate(stripHTML(value),
                                              self.dateformat))
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))
            if 'Concluded' in label:
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))
            if 'Updated' in label:
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

        status = other_attribute_element.find('span', {'class': 'ab'})
        if status != None:
            if 'Incomplete and Inactive' in status.text:
                self.story.setMetadata('status', 'Incomplete')
            else:
                self.story.setMetadata('status', 'In-Progress')
            if "Last Activity" in status.text:
                # date is passed as a timestamp and converted in JS.
                value = status.findNext('noscript').text
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))
        else:
            self.story.setMetadata('status', 'Completed')
コード例 #4
0
 def parseDateText(text):
     """Turn a site date string into a date.

     u'Сегодня' ("today") and u'Вчера' ("yesterday") are relative
     labels; anything else is parsed as 'dd.mm.yyyy'.
     """
     if text == u'Сегодня':
         return todayInMoscow()
     if text == u'Вчера':
         return todayInMoscow() - datetime.timedelta(days=1)
     return makeDate(text, '%d.%m.%Y')
コード例 #5
0
 def parseDateText(text):
     """Parse a chapter timestamp string.

     Recognises the Russian relative labels u'Вчера' (yesterday) and
     u'Сегодня' (today); other values are parsed as 'dd.mm.yyyy, HH:MM'.
     """
     if text == u'Вчера':
         yesterday = todayInMoscow() - datetime.timedelta(days=1)
         return yesterday
     if text == u'Сегодня':
         return todayInMoscow()
     return makeDate(text, '%d.%m.%Y, %H:%M')
コード例 #6
0
 def getDateFromComponents(self, postmonth, postday):
     """Combine a month header and a day cell into a date.

     *postmonth* holds text like "Entries in January 2015"; *postday*
     holds text like "03 Jan".  The day, month name and year are
     recombined ("03 January 2015") and parsed with self.dateformat.
     Raw strings: the original plain literals made "\d" and "\ "
     invalid escape sequences (DeprecationWarning since Python 3.6).
     """
     ym = re.search(
         r"Entries in (?P<mon>January|February|March|April|May|June|July|August|September|October|November|December) (?P<year>\d{4})",
         stripHTML(postmonth),
     )
     d = re.search(r"(?P<day>\d{2})\ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", stripHTML(postday))
     postdate = makeDate(d.group("day") + " " + ym.group("mon") + " " + ym.group("year"), self.dateformat)
     return postdate
コード例 #7
0
    def parseOtherAttributes(self, other_attribute_element):
        """Read rating, score, tags, genre, dates and status from the
        story's attribute block and record them on self.story."""
        for bold in other_attribute_element.findAll('b'):
            #logger.debug('Getting metadata: "%s"' % bold)
            label = bold.text
            # Date fields are rendered client-side; the plain-text
            # fallback sits in the following <noscript> tag.
            if label in ['Posted:', 'Concluded:', 'Updated:']:
                value = bold.findNext('noscript').text
            else:
                value = bold.nextSibling
            #logger.debug('label: "%s", value: "%s"' % (label, value))

            if 'Sex' in label:
                self.story.setMetadata('rating', value)
            if 'Score' in label and value != '-':
                self.story.setMetadata('score', value)

            if 'Tags' in label or 'Codes' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
                    self.story.addToList('sitetags', code)
            if 'Genre' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
                    self.story.addToList('genre', code)

            if 'Posted' in label:
                # 'Posted' seeds both date fields.
                posted = makeDate(stripHTML(value), self.dateformat)
                self.story.setMetadata('datePublished', posted)
                self.story.setMetadata('dateUpdated', posted)
            if 'Concluded' in label:
                self.story.setMetadata('dateUpdated',
                                       makeDate(stripHTML(value), self.dateformat))
            if 'Updated' in label:
                self.story.setMetadata('dateUpdated',
                                       makeDate(stripHTML(value), self.dateformat))

        status = other_attribute_element.find('span', {'class': 'ab'})
        if status is None:
            # No status span at all means the story is finished.
            self.story.setMetadata('status', 'Completed')
        else:
            if 'Incomplete and Inactive' in status.text:
                self.story.setMetadata('status', 'Incomplete')
            else:
                self.story.setMetadata('status', 'In-Progress')
            if "Last Activity" in status.text:
                # date is passed as a timestamp and converted in JS.
                value = status.findNext('noscript').text
                self.story.setMetadata('dateUpdated',
                                       makeDate(stripHTML(value), self.dateformat))
コード例 #8
0
 def getDateFromComponents(self, postmonth, postday):
     """Combine a month header and a day cell into a date.

     *postmonth* holds text like "Entries in January 2015"; *postday*
     holds text like "03 Jan"; the pieces are recombined into
     "03 January 2015" and parsed with self.dateformat.
     Raw strings: the original plain literals made "\d" and "\ "
     invalid escape sequences (DeprecationWarning since Python 3.6).
     """
     ym = re.search(
         r"Entries in (?P<mon>January|February|March|April|May|June|July|August|September|October|November|December) (?P<year>\d{4})",
         stripHTML(postmonth))
     d = re.search(
         r"(?P<day>\d{2})\ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)",
         stripHTML(postday))
     postdate = makeDate(
         d.group('day') + ' ' + ym.group('mon') + ' ' + ym.group('year'),
         self.dateformat)
     return postdate
コード例 #9
0
 def _parseDate(self):
     """Locate and parse the chapter date from the info bar.

     The date is the text node just before the eye icon, padded with
     '|' separators and whitespace.  Raises ParsingError when the
     expected markup cannot be found.
     """
     try:
         icon = self._getInfoBarElement().find('i', {'class': 'icon-eye'})
         date_text = icon.findPreviousSibling(text=True).strip(u'| \n')
     except AttributeError:
         raise ParsingError(u'Failed to locate date.')
     return makeDate(date_text, '%d.%m.%Y')
コード例 #10
0
 def date_span_tag_to_date(self, containingtag):
     """Return the date carried by the <span> inside *containingtag*.

     Prefers the numeric data-time attribute; falls back to parsing
     the human-readable title attribute.
     """
     ## <span data-time="1435421997" title="Saturday 27th of June 2015 @4:19pm">Jun 27th, 2015</span>
     ## No timezone adjustment is done.
     span = containingtag.find('span', {'data-time': re.compile(r'^\d+$')})
     if span is not None:  # idiomatic identity test (was `!= None`)
         return datetime.fromtimestamp(float(span['data-time']))
     ## Sometimes, for reasons that are unclear, data-time is not present. Parse the date out of the title instead.
     span = containingtag.find('span', title=True)
     dateRegex = re.search(r'([a-zA-Z ]+)([0-9]+)(st of|th of|nd of|rd of)([a-zA-Z ]+[0-9]+)', span['title'])
     # NOTE(review): a non-matching title raises AttributeError on
     # .group() below -- presumably callers treat that as a failure.
     dateString = dateRegex.group(2) + dateRegex.group(4)
     return makeDate(dateString, "%d %B %Y")
コード例 #11
0
 def date_span_tag_to_date(self, containingtag):
     """Return the date carried by the <span> inside *containingtag*,
     preferring the numeric data-time attribute over the title text.
     """
     ## <span data-time="1435421997" title="Saturday 27th of June 2015 @4:19pm">Jun 27th, 2015</span>
     ## No timezone adjustment is done.
     span = containingtag.find('span', {'data-time': re.compile(r'^\d+$')})
     if span is not None:  # idiomatic identity test (was `!= None`)
         return datetime.fromtimestamp(float(span['data-time']))
     ## Sometimes, for reasons that are unclear, data-time is not present. Parse the date out of the title instead.
     span = containingtag.find('span', title=True)
     dateRegex = re.search(
         r'([a-zA-Z ]+)([0-9]+)(st of|th of|nd of|rd of)([a-zA-Z ]+[0-9]+)',
         span['title'])
     # NOTE(review): a non-matching title raises AttributeError on
     # .group() below -- presumably callers treat that as a failure.
     dateString = dateRegex.group(2) + dateRegex.group(4)
     return makeDate(dateString, "%d %B %Y")
コード例 #12
0
    def extract_threadmarks(self, souptag):
        """Return a list of threadmark dicts scraped from *souptag*.

        For each category link in the threadmarkMenus div, fetches the
        category's threadmarks.rss feed and collects one dict per item
        with keys: tmcat_name, tmcat_num, tmcat_index, title, url, date,
        author.  Returns [] when the page has no threadmark menu.
        """
        # try threadmarks if no '#' in url
        navdiv = souptag.find('div', {'class': 'threadmarkMenus'})  # SB/SV
        if not navdiv:
            return []
        threadmarksas = navdiv.find_all('a', {'class': 'threadmarksTrigger'})

        ## Loop on threadmark categories.
        threadmarks = []
        tmcat_num = None

        # convenience method.
        def xml_tag_string(dom, tag):
            # First child of the first <tag> element, UTF-8 encoded.
            return dom.getElementsByTagName(tag)[0].firstChild.data.encode(
                "utf-8")

        for threadmarksa in threadmarksas:
            # Fetch the RSS version of this category's threadmark list.
            threadmark_rss_dom = parseString(
                self._fetchUrl(self.getURLPrefix() + '/' +
                               threadmarksa['href'].replace(
                                   'threadmarks', 'threadmarks.rss')))
            # print threadmark_rss_dom.toxml(encoding='utf-8')

            tmcat_num = threadmarksa['href'].split('category_id=')[1]
            tmcat_name = stripHTML(threadmarksa)

            for tmcat_index, item in enumerate(
                    threadmark_rss_dom.getElementsByTagName("item")):
                title = xml_tag_string(item, "title")
                url = xml_tag_string(item, "link")
                author = xml_tag_string(item, "dc:creator")
                date = xml_tag_string(item, "pubDate")
                ## Fri, 23 Jun 2017 16:52:57 +0000
                date = makeDate(
                    date[5:-6],
                    '%d %b %Y %H:%M:%S')  # toss day-of-week and TZ--locales.
                threadmarks.append({
                    "tmcat_name": tmcat_name,
                    "tmcat_num": tmcat_num,
                    "tmcat_index": tmcat_index,
                    "title": title,
                    "url": url,
                    "date": date,
                    "author": author
                })
        return threadmarks
コード例 #13
0
 def make_date(self, parenttag):
     """Extract a post date from *parenttag*.

     Forums use a BS thing where dates can appear different if recent:
     older posts carry a <span class="DateTime" title="...">, recent
     ones an <abbr class="DateTime"> with data-datestring /
     data-timestring attributes.  Returns None when no date can be
     found or parsed (logged at debug level).
     """
     datestr = None
     try:
         datetag = parenttag.find('span', {'class': 'DateTime'})
         if datetag:
             datestr = datetag['title']
         else:
             datetag = parenttag.find('abbr', {'class': 'DateTime'})
             if datetag:
                 datestr = "%s at %s" % (datetag['data-datestring'],
                                         datetag['data-timestring'])
         # Apr 24, 2015 at 4:39 AM
         # May 1, 2015 at 5:47 AM
         # add leading 0 for single digit day & hours.
         datestr = re.sub(r' (\d[^\d])', r' 0\1', datestr)
         return makeDate(datestr, self.dateformat)
     except Exception:
         # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
         # aren't swallowed; a None/garbled datestr still lands here.
         logger.debug('No date found in %s' % parenttag, exc_info=True)
         return None
コード例 #14
0
 def make_date(self, parenttag):
     """Extract a post date from *parenttag*.

     Forums use a BS thing where dates can appear different if recent:
     older posts carry a <span class="DateTime" title="...">, recent
     ones an <abbr class="DateTime"> with data-datestring /
     data-timestring attributes.  Returns None when no date can be
     found or parsed (logged at debug level).
     """
     datestr = None
     try:
         datetag = parenttag.find('span', {'class': 'DateTime'})
         if datetag:
             datestr = datetag['title']
         else:
             datetag = parenttag.find('abbr', {'class': 'DateTime'})
             if datetag:
                 datestr = "%s at %s" % (datetag['data-datestring'],
                                         datetag['data-timestring'])
         # Apr 24, 2015 at 4:39 AM
         # May 1, 2015 at 5:47 AM
         # add leading 0 for single digit day & hours.
         datestr = re.sub(r' (\d[^\d])', r' 0\1', datestr)
         return makeDate(datestr, self.dateformat)
     except Exception:
         # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
         # aren't swallowed.  exc_info added so the suppressed traceback
         # is still visible at debug level.
         logger.debug('No date found in %s' % parenttag, exc_info=True)
         return None
コード例 #15
0
    def getChapterText(self, url):
        """Fetch one chapter page and return its text.

        Also tracks datePublished: the earliest "Uploaded On" timestamp
        seen across chapters wins.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = self.make_soup(self._fetchUrl(url))

        # Note the trailing space in the class name -- it is in the markup.
        headerstr = stripHTML(soup.find('div', {'class': 'post-meta clearfix '}))

        match = re.match(r".*?Uploaded On: ([a-zA-Z]+ \d\d, \d\d\d\d \d\d:\d\d)", headerstr)
        if match:
            uploaded = makeDate(match.group(1), self.dateformat)
            published = self.story.getMetadataRaw('datePublished')
            if not published or uploaded < published:
                self.story.setMetadata('datePublished', uploaded)

        chapter_div = soup.find('div', {'id': 'fanfic-text'})
        return self.utf8FromSoup(url, chapter_div)
コード例 #16
0
 def ordinal_date_string_to_date(self, datestring):
     """Drop English ordinal suffixes (1st, 2nd, 3rd, 4th...) and parse
     the remainder with self.dateformat."""
     without_suffix = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", datestring.strip())
     return makeDate(without_suffix, self.dateformat)
コード例 #17
0
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
            
            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
コード例 #18
0
    def extractChapterUrlsAndMetadata(self):
        """Collect chapter URLs and story metadata.

        First fetches the story page for the author link and chapter
        anchors, then the author's page to find the listbox <div> whose
        story link matches this storyId, and parses title, summary,
        category, rating, chapter/review counts, status, dates and
        reading count out of that listbox.
        """
        soup = self._customized_fetch_url(self.url)

        # Since no 404 error code we have to raise the exception ourselves.
        # A title that is just 'by' indicates that there is no author name
        # and no story title available.
        if soup.title.string.strip() == 'by':
            raise exceptions.StoryDoesNotExist(self.url)

        # "storycontent" is found in a single-chapter story
        author_anchor = soup.find('div', id=lambda id: id in ('main', 'storycontent')).h1.a
        self.story.setMetadata('author', author_anchor.string)

        # Author id is the path segment right after 'authors'.
        url_tokens = author_anchor['href'].split('/')
        author_id = url_tokens[url_tokens.index('authors')+1]
        self.story.setMetadata('authorId', author_id)
        self.story.setMetadata('authorUrl', self.AUTHORS_URL_TEMPLATE % author_id)

        chapter_anchors = soup('a', href=lambda href: href and href.startswith('/fanfiction/story/'))
        for chapter_anchor in chapter_anchors:
            url = urlparse.urljoin(self.BASE_URL, chapter_anchor['href'])
            self.chapterUrls.append((chapter_anchor.string, url))

        author_url = urlparse.urljoin(self.BASE_URL, author_anchor['href'])
        soup = self._customized_fetch_url(author_url)
        story_id = self.story.getMetadata('storyId')
        for listbox in soup('div', {'class': 'listbox'}):
            url_tokens = listbox.a['href'].split('/')
            # Found the div containing the story's metadata; break the loop and
            # parse the element
            if story_id == url_tokens[url_tokens.index('story')+1]:
                break
        # for/else: runs only when no listbox matched (loop never broke).
        else:
            raise exceptions.FailedToDownload(self.url)

        title = listbox.a.string
        self.story.setMetadata('title', title)

        # No chapter anchors found in the original story URL, so the story has
        # only a single chapter.
        if not chapter_anchors:
            self.chapterUrls.append((title, self.url))

        for b_tag in listbox('b'):
            key = b_tag.string.strip(':')
            try:
                value = b_tag.nextSibling.string.replace('&bull;', '').strip(': ')
            # This can happen with some fancy markup in the summary. Just
            # ignore this error and set value to None, the summary parsing
            # takes care of this
            except AttributeError:
                value = None

            if key == 'Summary':
                contents = []
                keep_summary_html = self.getConfig('keep_summary_html')

                # Accumulate siblings until the next bold-label-after-<br>,
                # which marks the start of the following metadata field.
                for sibling in _yield_next_siblings(b_tag):
                    if isinstance(sibling, BeautifulSoup.Tag):
                        if sibling.name == 'b' and sibling.findPreviousSibling().name == 'br':
                            break

                        if keep_summary_html:
                            contents.append(self.utf8FromSoup(author_url, sibling))
                        else:
                            contents.append(''.join(sibling(text=True)))
                    else:
                        contents.append(sibling)

                # Pop last break line tag
                contents.pop()
                self.story.setMetadata('description', ''.join(contents))

            elif key == 'Category':
                for sibling in b_tag.findNextSiblings(['a', 'b']):
                    if sibling.name == 'b':
                        break

                    self.story.addToList('category', sibling.string)

            elif key == 'Rating':
                self.story.setMetadata('rating', value)

            elif key == 'Chapters':
                self.story.setMetadata('numChapters', int(value))

                # Also parse reviews number which lies right after the chapters
                # section
                reviews_anchor = b_tag.findNextSibling('a')
                reviews = reviews_anchor.string.split(' ')[1].strip('()')
                self.story.setMetadata('reviews', reviews)

            elif key == 'Completed':
                self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

            elif key == 'Date Added':
                self.story.setMetadata('datePublished', makeDate(value, self.DATETIME_FORMAT))

            elif key == 'Last Updated':
                self.story.setMetadata('dateUpdated', makeDate(value, self.DATETIME_FORMAT))

            elif key == 'Read':
                self.story.setMetadata('readings', value.split()[0])

        # Adult gate: NC-17 stories require explicit is_adult consent.
        if self.story.getMetadata('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)
コード例 #19
0
class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
    """Site adapter for harrypotterfanfiction.com (abbreviation 'hp').

    NOTE(review): this class uses Python-2-only constructs
    (`except urllib2.HTTPError, e` and `unicode`), so it cannot run
    unmodified under Python 3.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'hp')
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only psid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

        # normalized story URL.
        self._setURL('https://' + self.getSiteDomain() +
                     '/viewstory.php?psid=' +
                     self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        # Canonical domain used to build normalized URLs.
        return 'www.harrypotterfanfiction.com'

    @classmethod
    def getAcceptDomains(cls):
        # Both with and without the www. prefix are accepted.
        return ['www.harrypotterfanfiction.com', 'harrypotterfanfiction.com']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://www.harrypotterfanfiction.com/viewstory.php?psid=1234"

    def getSiteURLPattern(self):
        # Matches http(s)://(www.)harrypotterfanfiction.com/viewstory.php?psid=<digits>
        return r"https?" + re.escape("://") + r"(www\.)?" + re.escape(
            "harrypotterfanfiction.com/viewstory.php?psid=") + r"\d+$"

    def needToLoginCheck(self, data):
        # Page text that indicates a login wall or a failed login attempt.
        if 'Registered Users Only' in data \
                or 'There is no such account on our website' in data \
                or "That password doesn't match the one in our database" in data:
            return True
        else:
            return False

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate chapter URLs plus
        title, author, word count, description, status, rating, genre,
        characters, warnings and dates on self.story."""

        url = self.url + '&index=1'
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        # NOTE(review): Python 2 `except X, e` syntax.
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(
                self.getSiteDomain() +
                " says: Access denied. This story has not been validated by the adminstrators of this site."
            )
        elif "ERROR locating story meta for psid" in data:
            raise exceptions.StoryDoesNotExist(self.url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        ## Title
        a = soup.find('a',
                      href=re.compile(r'\?psid=' +
                                      self.story.getMetadata('storyId')))
        self.story.setMetadata('title', stripHTML(a))
        ## javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid=290995'
        if "This story may contain adult themes." in a['href'] and not (
                self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?showuid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl',
                               'https://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        ## hpcom doesn't give us total words--but it does give
        ## us words/chapter.  I'd rather add than fetch and
        ## parse another page.
        words = 0
        for tr in soup.find('table', {'class': 'text'}).findAll('tr'):
            tdstr = tr.findAll('td')[2].string
            if tdstr and tdstr.isdigit():
                words += int(tdstr)
        self.story.setMetadata('numWords', unicode(words))

        # Find the chapters:
        tablelist = soup.find('table', {'class': 'text'})
        for chapter in tablelist.findAll('a',
                                         href=re.compile(r'\?chapterid=\d+')):
            #javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1'
            # just in case there's tags, like <i> in chapter titles.
            chpt = re.sub(r'^.*?(\?chapterid=\d+).*?', r'\1', chapter['href'])
            self.chapterUrls.append(
                (stripHTML(chapter),
                 'https://' + self.host + '/viewstory.php' + chpt))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        ## Finding the metadata is a bit of a pain.  Desc is the only thing this color.
        desctable = soup.find('table', {'bgcolor': '#f0e8e8'})
        self.setDescription(url, desctable)
        #self.story.setMetadata('description',stripHTML(desctable))

        ## Finding the metadata is a bit of a pain.  Most of the meta
        ## data is in a center.table without a bgcolor.
        #for center in soup.findAll('center'):
        table = soup.find('table', {'class': 'storymaininfo'})
        if table:
            # Flatten the table to one whitespace-normalized string and
            # pick the fields out with regexes.
            metastr = stripHTML(unicode(table)).replace('\n', ' ').replace(
                '\t', ' ')
            # Rating: 12+ Story Reviews: 3
            # Chapters: 3
            # Characters: Andromeda, Ted, Bellatrix, R. Lestrange, Lucius, Narcissa, OC
            # Genre(s): Fluff, Romance, Young Adult Era: OtherPairings: Other Pairing, Lucius/Narcissa
            # Status: Completed
            # First Published: 2010.09.02
            # Last Published Chapter: 2010.09.28
            # Last Updated: 2010.09.28
            # Favorite Story Of: 1 users
            # Warnings: Scenes of a Mild Sexual Nature

            m = re.match(r".*?Status: Completed.*?", metastr)
            if m:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

            m = re.match(r".*?Rating: (.+?) Story Reviews.*?", metastr)
            if m:
                self.story.setMetadata('rating', m.group(1))

            m = re.match(r".*?Genre\(s\): (.+?) Era.*?", metastr)
            if m:
                for g in m.group(1).split(','):
                    self.story.addToList('genre', g)

            m = re.match(r".*?Characters: (.+?) Genre.*?", metastr)
            if m:
                for g in m.group(1).split(','):
                    self.story.addToList('characters', g)

            m = re.match(r".*?Warnings: (.+).*?", metastr)
            if m:
                for w in m.group(1).split(','):
                    if w != 'Now Warnings':
                        self.story.addToList('warnings', w)

            m = re.match(r".*?First Published: ([0-9\.]+).*?", metastr)
            if m:
                self.story.setMetadata('datePublished',
                                       makeDate(m.group(1), "%Y.%m.%d"))

            # Updated can have more than one space after it. <shrug>
            m = re.match(r".*?Last Updated: ([0-9\.]+).*?", metastr)
            if m:
                self.story.setMetadata('dateUpdated',
                                       makeDate(m.group(1), "%Y.%m.%d"))
0
    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story's metadata page and populate self.story plus
        self.chapterUrls.

        Raises AdultCheckRequired when the site gates the story behind a
        login and is_adult isn't set, and FailedToDownload for any other
        site-reported error text or a mismatched display skin.
        """
        soup = self._customized_fetch_url(self.url + self.METADATA_URL_SUFFIX)

        # Check if the story is for "Registered Users Only", i.e. has adult
        # content. Based on the "is_adult" attributes either login or raise an
        # error.
        errortext_div = soup.find('div', {'class': 'errortext'})
        if errortext_div:
            error_text = ''.join(errortext_div(text=True)).strip()
            if error_text == 'Registered Users Only':
                if not (self.is_adult or self.getConfig('is_adult')):
                    raise exceptions.AdultCheckRequired(self.url)
                self._login()
            else:
                # This case usually occurs when the story doesn't exist, but
                # might potentially be something else, so just raise
                # FailedToDownload exception with the found error text.
                raise exceptions.FailedToDownload(error_text)

        # Re-fetch with the age-consent suffix appended so the full metadata
        # is visible.
        url = ''.join([self.url, self.METADATA_URL_SUFFIX, self.AGE_CONSENT_URL_SUFFIX])
        soup = self._customized_fetch_url(url)

        # If logged in and the skin doesn't match the required skin throw an
        # error
        if self.is_logged_in:
            skin = soup.find('select', {'name': 'skin'}).find('option', selected=True)['value']
            if skin != self.REQUIRED_SKIN:
                raise exceptions.FailedToDownload('Required skin "%s" must be set in preferences' % self.REQUIRED_SKIN)

        pagetitle_div = soup.find('div', id='pagetitle')
        self.story.setMetadata('title', pagetitle_div.a.string)

        # The author anchor immediately follows the title anchor.
        author_anchor = pagetitle_div.a.findNextSibling('a')
        url = urlparse.urljoin(self.BASE_URL, author_anchor['href'])
        components = urlparse.urlparse(url)
        query_data = urlparse.parse_qs(components.query)

        self.story.setMetadata('author', author_anchor.string)
        # NOTE(review): urlparse.parse_qs maps each key to a *list* of
        # values, so 'authorId' is stored as a one-element list here --
        # confirm downstream consumers accept that (or take [0]).
        self.story.setMetadata('authorId', query_data['uid'])
        self.story.setMetadata('authorUrl', url)

        # The second anchor inside the "sort" div carries the review count.
        sort_div = soup.find('div', id='sort')
        self.story.setMetadata('reviews', sort_div('a')[1].string)

        # Metadata is laid out as "<b>Label:</b> value" pairs inside the
        # listbox div; dispatch on the label text.
        for b_tag in soup.find('div', {'class': 'listbox'})('b'):
            key = b_tag.string.strip(' :')
            try:
                value = b_tag.nextSibling.string.strip()
            # This can happen with some fancy markup in the summary. Just
            # ignore this error and set value to None, the summary parsing
            # takes care of this
            except AttributeError:
                value = None

            if key == 'Summary':
                contents = []
                keep_summary_html = self.getConfig('keep_summary_html')

                for sibling in _yield_next_siblings(b_tag):
                    if isinstance(sibling, BeautifulSoup.Tag):
                        # Encountered next label, break. This method is the
                        # safest and most reliable I could think of. Blame
                        # e-fiction sites that allow their users to include
                        # arbitrary markup into their summaries and the
                        # horrible HTML markup.
                        if sibling.name == 'b' and sibling.findPreviousSibling().name == 'br':
                            break

                        if keep_summary_html:
                            contents.append(self.utf8FromSoup(self.url, sibling))
                        else:
                            contents.append(''.join(sibling(text=True)))
                    else:
                        contents.append(sibling)

                # Remove the preceding break line tag and other crud
                contents.pop()
                contents.pop()
                self.story.setMetadata('description', ''.join(contents))

            elif key == 'Rating':
                self.story.setMetadata('rating', value)

            elif key == 'Category':
                # Category values are sibling anchors up to the next <br>.
                for sibling in b_tag.findNextSiblings(['a', 'br']):
                    if sibling.name == 'br':
                        break
                    self.story.addToList('category', sibling.string)

            # Seems to be always "None" for some reason
            elif key == 'Characters':
                for sibling in b_tag.findNextSiblings(['a', 'br']):
                    if sibling.name == 'br':
                        break
                    self.story.addToList('characters', sibling.string)

            elif key == 'Series':
                a = b_tag.findNextSibling('a')
                if not a:
                    continue
                self.story.setMetadata('series', a.string)
                self.story.setMetadata('seriesUrl', urlparse.urljoin(self.BASE_URL, a['href']))

            elif key == 'Chapter':
                self.story.setMetadata('numChapters', int(value))

            elif key == 'Completed':
                self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

            elif key == 'Words':
                self.story.setMetadata('numWords', value)

            elif key == 'Read':
                self.story.setMetadata('readings', value)

            elif key == 'Published':
                self.story.setMetadata('datePublished', makeDate(value, self.DATETIME_FORMAT))

            elif key == 'Updated':
                self.story.setMetadata('dateUpdated', makeDate(value, self.DATETIME_FORMAT))

        # Chapter links live after the "output" div as <b><a href=...></b>
        # siblings; collect (title, absolute url) pairs in page order.
        for b_tag in soup.find('div', id='output').findNextSiblings('b'):
            chapter_anchor = b_tag.a
            title = chapter_anchor.string
            url = urlparse.urljoin(self.BASE_URL, chapter_anchor['href'])
            self.chapterUrls.append((title, url))
コード例 #21
0
class FicBookNetAdapter(BaseSiteAdapter):
    """
    Adapter for ficbook.net, a Russian fan-fiction archive.

    Dates on the site use (possibly transliterated) Russian month names,
    which are mapped to numbers before strptime parsing.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "******" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])


        # normalized story URL.
        self._setURL('https://' + self.getSiteDomain() + '/readfic/'+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','fbn')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %m %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.ficbook.net'

    @classmethod
    def getSiteExampleURLs(cls):
        # Example story URLs shown to the user in documentation/UI.
        return "https://"+cls.getSiteDomain()+"/readfic/12345 https://"+cls.getSiteDomain()+"/readfic/93626/246417#part_content"

    def getSiteURLPattern(self):
        # Accept both http and https story URLs with a numeric story id.
        return r"https?://"+re.escape(self.getSiteDomain()+"/readfic/")+r"\d+"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """
        Scrape the story page for metadata and the chapter list.

        Raises StoryDoesNotExist on HTTP 404 and AdultCheckRequired when
        the adult-content cover is shown and is_adult isn't set.
        """
        url=self.url
        logger.debug("URL: "+url)
        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e


        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # The adult warning overlay: remove it if allowed, otherwise bail.
        adult_div = soup.find('div',id='adultCoverWarning')
        if adult_div:
            if self.is_adult or self.getConfig("is_adult"):
                adult_div.extract()
            else:
                raise exceptions.AdultCheckRequired(self.url)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('section',{'class':'chapter-info'}).find('h1')
        # kill '+' marks if present.
        sup = a.find('sup')
        if sup:
            sup.extract()
        self.story.setMetadata('title',stripHTML(a))
        logger.debug("Title: (%s)"%self.story.getMetadata('title'))

        # Find authorid and URL from... author url.
        # assume first avatar-nickname -- there can be a second marked 'beta'.
        a = soup.find('a',{'class':'avatar-nickname'})
        self.story.setMetadata('authorId',a.text) # Author's name is unique
        # NOTE(review): if a['href'] already starts with '/', this join
        # produces a double slash in the URL -- confirm the site tolerates it.
        self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.text)
        logger.debug("Author: (%s)"%self.story.getMetadata('author'))

        # Find the chapters:
        pubdate = None
        chapters = soup.find('ul', {'class' : 'table-of-contents'})
        if chapters != None:
            # Multi-chapter story: one anchor per chapter in the TOC.
            chapters=chapters.findAll('a', href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+"/\d+#part_content$"))
            self.story.setMetadata('numChapters',len(chapters))
            for x in range(0,len(chapters)):
                chapter=chapters[x]
                churl='https://'+self.host+chapter['href']
                self.add_chapter(chapter,churl)
                ## First chapter doesn't always have a date, skip it.
                if pubdate == None and chapter.parent.find('span'):
                    # translit converts Cyrillic date text to Latin characters.
                    pubdate = translit.translit(stripHTML(chapter.parent.find('span')))
                    # pubdate = translit.translit(stripHTML(self.make_soup(self._fetchUrl(churl)).find('div', {'class' : 'part_added'}).find('span')))
                if x == len(chapters)-1:
                    # Last chapter's date is treated as the update date.
                    update = translit.translit(stripHTML(chapter.parent.find('span')))
                    # update = translit.translit(stripHTML(self.make_soup(self._fetchUrl(churl)).find('div', {'class' : 'part_added'}).find('span')))
        else:
            # Single-chapter story: the story page itself is the chapter.
            self.add_chapter(self.story.getMetadata('title'),url)
            self.story.setMetadata('numChapters',1)
            pubdate=translit.translit(stripHTML(soup.find('div',{'class':'title-area'}).find('span')))
            update=pubdate

        logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))

        # NOTE(review): dates without a comma are assumed to be "today"
        # (site shows relative dates for recent posts) -- confirm.
        if not ',' in pubdate:
            pubdate=datetime.date.today().strftime(self.dateformat)
        if not ',' in update:
            update=datetime.date.today().strftime(self.dateformat)
        # Drop the time portion after the comma, keep only the date part.
        pubdate=pubdate.split(',')[0]
        update=update.split(',')[0]

        # Map transliterated and Cyrillic genitive month names to numbers so
        # the date matches self.dateformat ("%d %m %Y").
        fullmon = {"yanvarya":"01", u"января":"01",
           "fievralya":"02", u"февраля":"02",
           "marta":"03", u"марта":"03",
           "aprielya":"04", u"апреля":"04",
           "maya":"05", u"мая":"05",
           "iyunya":"06", u"июня":"06",
           "iyulya":"07", u"июля":"07",
           "avghusta":"08", u"августа":"08",
           "sentyabrya":"09", u"сентября":"09",
           "oktyabrya":"10", u"октября":"10",
           "noyabrya":"11", u"ноября":"11",
           "diekabrya":"12", u"декабря":"12" }

        for (name,num) in fullmon.items():
            if name in pubdate:
                pubdate = pubdate.replace(name,num)
            if name in update:
                update = update.replace(name,num)

        self.story.setMetadata('dateUpdated', makeDate(update, self.dateformat))
        self.story.setMetadata('datePublished', makeDate(pubdate, self.dateformat))
        self.story.setMetadata('language','Russian')

        ## after site change, I don't see word count anywhere.
        # pr=soup.find('a', href=re.compile(r'/printfic/\w+'))
        # pr='https://'+self.host+pr['href']
        # pr = self.make_soup(self._fetchUrl(pr))
        # pr=pr.findAll('div', {'class' : 'part_text'})
        # i=0
        # for part in pr:
        #     i=i+len(stripHTML(part).split(' '))
        # self.story.setMetadata('numWords', unicode(i))


        dlinfo = soup.find('dl',{'class':'info'})

        # Fandom anchors become categories; more than one fandom implies a
        # crossover, flagged with the Russian genre tag below.
        i=0
        fandoms = dlinfo.find('dd').findAll('a', href=re.compile(r'/fanfiction/\w+'))
        for fandom in fandoms:
            self.story.addToList('category',fandom.string)
            i=i+1
        if i > 1:
            self.story.addToList('genre', u'Кроссовер')

        for genre in dlinfo.findAll('a',href=re.compile(r'/genres/')):
            self.story.addToList('genre',stripHTML(genre))

        # The rating value is the text node following the Russian
        # "Rating:" label.
        ratingdt = dlinfo.find('dt',text='Рейтинг:')
        self.story.setMetadata('rating', stripHTML(ratingdt.next_sibling))

        # meta=table.findAll('a', href=re.compile(r'/ratings/'))
        # i=0
        # for m in meta:
        #     if i == 0:
        #         self.story.setMetadata('rating', stripHTML(m))
        #         i=1
        #     elif i == 1:
        #         if not "," in m.nextSibling:
        #             i=2
        #         self.story.addToList('genre', m.find('b').text)
        #     elif i == 2:
        #         self.story.addToList('warnings', m.find('b').text)

        # NOTE(review): completion is inferred from a green-styled span in
        # the info list -- fragile against site CSS changes; confirm.
        if dlinfo.find('span', {'style' : 'color: green'}):
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')


        # The characters list follows the "Characters:" label (matched in
        # both transliterated and Cyrillic form).
        tags = dlinfo.findAll('dt')
        for tag in tags:
            label = translit.translit(tag.text)
            if 'Piersonazhi:' in label or u'Персонажи:' in label:
                chars=stripHTML(tag.next_sibling).split(', ')
                for char in chars:
                    self.story.addToList('characters',char)
                break

        summary=soup.find('div', {'class' : 'urlize'})
        self.setDescription(url,summary)
コード例 #22
0
    def extractChapterUrlsAndMetadata(self):
        idstr = self.story.getMetadata('storyId')
        idnum = int(idstr)
        self.do_sleep()

        if idnum >= 1000:
            logger.warn("storyId:%s - Custom INI data will be used." % idstr)

            sections = ['teststory:%s' % idstr, 'teststory:defaults']
            #print("self.get_config_list(sections,'valid_entries'):%s"%self.get_config_list(sections,'valid_entries'))
            for key in self.get_config_list(sections, 'valid_entries'):
                if key.endswith("_list"):
                    nkey = key[:-len("_list")]
                    #print("addList:%s"%(nkey))
                    for val in self.get_config_list(sections, key):
                        #print("addList:%s->%s"%(nkey,val))
                        self.story.addToList(
                            nkey,
                            val.decode('utf-8').replace('{{storyId}}', idstr))
                else:
                    # Special cases:
                    if key in ['datePublished', 'dateUpdated']:
                        self.story.setMetadata(
                            key,
                            makeDate(self.get_config(sections, key),
                                     "%Y-%m-%d"))
                    else:
                        self.story.setMetadata(
                            key,
                            self.get_config(sections,
                                            key).decode('utf-8').replace(
                                                '{{storyId}}', idstr))
                    #print("set:%s->%s"%(key,self.story.getMetadata(key)))

            self.chapterUrls = []
            if self.has_config(sections, 'chapter_urls'):
                for l in self.get_config(sections,
                                         'chapter_urls').splitlines():
                    if l:
                        self.chapterUrls.append(
                            (l[1 + l.index(','):], l[:l.index(',')]))
            else:
                for (j, chap) in enumerate(self.get_config_list(
                        sections, 'chaptertitles'),
                                           start=1):
                    self.chapterUrls.append(
                        (chap, self.url + "&chapter=%d" % j))
            # self.chapterUrls = [(u'Prologue '+self.crazystring,self.url+"&chapter=1"),
            #                 ('Chapter 1, Xenos on Cinnabar',self.url+"&chapter=2"),
            #                 ]
            self.story.setMetadata('numChapters', len(self.chapterUrls))

            return

        if idnum >= 700 and idnum <= 710:
            self._setURL('http://test1.com?sid=%s' % (idnum + 100))
            self.story.setMetadata('storyId',
                                   self.parsedUrl.query.split('=', )[1])
            idstr = self.story.getMetadata('storyId')
            idnum = int(idstr)

        if idstr == '665' and not (self.is_adult
                                   or self.getConfig("is_adult")):
            logger.warn("self.is_adult:%s" % self.is_adult)
            raise exceptions.AdultCheckRequired(self.url)

        if idstr == '666':
            raise exceptions.StoryDoesNotExist(self.url)

        if idstr.startswith('670'):
            time.sleep(1.0)

        if idstr.startswith('671'):
            time.sleep(1.0)

        if self.getConfig("username"):
            self.username = self.getConfig("username")

        if idstr == '668' and self.username != "Me":
            raise exceptions.FailedToLogin(self.url, self.username)

        if idstr == '664':
            self.story.setMetadata(
                u'title', "Test Story Title " + idstr + self.crazystring)
            self.story.setMetadata(
                'author', 'Test Author aa bare amp(&) quote(&#39;) amp(&amp;)')
        else:
            self.story.setMetadata(u'title', "Test Story Title " + idstr)
            self.story.setMetadata('author', 'Test Author aa')
        self.setDescription(
            self.url, u'<div>Description ' + self.crazystring + u''' Done
<p>
Some more longer description.  "I suck at summaries!"  "Better than it sounds!"  "My first fic"
</div>''')
        self.story.setMetadata('datePublished',
                               makeDate("1975-03-15", "%Y-%m-%d"))
        if idstr == '669':
            self.story.setMetadata('dateUpdated', datetime.datetime.now())
        else:
            self.story.setMetadata('dateUpdated',
                                   makeDate("1975-04-15", "%Y-%m-%d"))

        if idstr != '674':
            self.story.setMetadata('numWords', '123456')

        if idnum % 2 == 1:
            self.story.setMetadata('status', 'In-Progress')
        else:
            self.story.setMetadata('status', 'Completed')

        # greater than 10, no language or series.
        if idnum < 10:
            langs = {
                0: "English",
                1: "Russian",
                2: "French",
                3: "German",
            }
            self.story.setMetadata('language', langs[idnum % len(langs)])
            self.setSeries('The Great Test', idnum)
            self.story.setMetadata('seriesUrl', 'http://test1.com?seriesid=1')
        if idnum == 0:
            self.setSeries(
                "A Nook Hyphen Test " + self.story.getMetadata('dateCreated'),
                idnum)
            self.story.setMetadata('seriesUrl', 'http://test1.com?seriesid=0')

        self.story.setMetadata('rating', 'Tweenie')

        if idstr == '673':
            self.story.addToList('author', 'Author From List 1')
            self.story.addToList('author', 'Author From List 2')
            self.story.addToList('author', 'Author From List 3')
            self.story.addToList('author', 'Author From List 4')
            self.story.addToList('author', 'Author From List 5')
            self.story.addToList('author', 'Author From List 6')
            self.story.addToList('author', 'Author From List 7')
            self.story.addToList('author', 'Author From List 8')
            self.story.addToList('author', 'Author From List 9')
            self.story.addToList('author', 'Author From List 0')
            self.story.addToList('author', 'Author From List q')
            self.story.addToList('author', 'Author From List w')
            self.story.addToList('author', 'Author From List e')
            self.story.addToList('author', 'Author From List r')
            self.story.addToList('author', 'Author From List t')
            self.story.addToList('author', 'Author From List y')
            self.story.addToList('author', 'Author From List u')
            self.story.addToList('author', 'Author From List i')
            self.story.addToList('author', 'Author From List o')

            self.story.addToList('authorId', '98765-1')
            self.story.addToList('authorId', '98765-2')
            self.story.addToList('authorId', '98765-3')
            self.story.addToList('authorId', '98765-4')
            self.story.addToList('authorId', '98765-5')
            self.story.addToList('authorId', '98765-6')
            self.story.addToList('authorId', '98765-7')
            self.story.addToList('authorId', '98765-8')
            self.story.addToList('authorId', '98765-9')
            self.story.addToList('authorId', '98765-0')
            self.story.addToList('authorId', '98765-q')
            self.story.addToList('authorId', '98765-w')
            self.story.addToList('authorId', '98765-e')
            self.story.addToList('authorId', '98765-r')
            self.story.addToList('authorId', '98765-t')
            self.story.addToList('authorId', '98765-y')
            self.story.addToList('authorId', '98765-u')
            self.story.addToList('authorId', '98765-i')
            self.story.addToList('authorId', '98765-o')

            self.story.addToList('authorUrl', 'http://author/url-1')
            self.story.addToList('authorUrl', 'http://author/url-2')
            self.story.addToList('authorUrl', 'http://author/url-3')
            self.story.addToList('authorUrl', 'http://author/url-4')
            self.story.addToList('authorUrl', 'http://author/url-5')
            self.story.addToList('authorUrl', 'http://author/url-6')
            self.story.addToList('authorUrl', 'http://author/url-7')
            self.story.addToList('authorUrl', 'http://author/url-8')
            self.story.addToList('authorUrl', 'http://author/url-9')
            self.story.addToList('authorUrl', 'http://author/url-0')
            self.story.addToList('authorUrl', 'http://author/url-q')
            self.story.addToList('authorUrl', 'http://author/url-w')
            self.story.addToList('authorUrl', 'http://author/url-e')
            self.story.addToList('authorUrl', 'http://author/url-r')
            self.story.addToList('authorUrl', 'http://author/url-t')
            self.story.addToList('authorUrl', 'http://author/url-y')
            self.story.addToList('authorUrl', 'http://author/url-u')
            self.story.addToList('authorUrl', 'http://author/url-i')
            self.story.addToList('authorUrl', 'http://author/url-o')

            self.story.addToList('category', 'Power Rangers')
            self.story.addToList('category', 'SG-1')
            self.story.addToList('genre', 'P**n')
            self.story.addToList('genre', 'Drama')
        elif idnum < 1000:
            self.story.setMetadata('authorId', '98765')
            self.story.setMetadata('authorUrl', 'http://author/url')

        self.story.addToList('warnings', 'Swearing')
        self.story.addToList('warnings', 'Violence')

        if idstr == '80':
            self.story.addToList('category', u'Rizzoli &amp; Isles')
            self.story.addToList('characters', 'J. Rizzoli')
        elif idstr == '81':
            self.story.addToList('category', u'Pitch Perfect')
            self.story.addToList('characters', 'Chloe B.')
        elif idstr == '82':
            self.story.addToList('characters', 'Henry (Once Upon a Time)')
            self.story.addToList('category', u'Once Upon a Time (TV)')
        elif idstr == '83':
            self.story.addToList('category', u'Rizzoli &amp; Isles')
            self.story.addToList('characters', 'J. Rizzoli')
            self.story.addToList('category', u'Pitch Perfect')
            self.story.addToList('characters', 'Chloe B.')
            self.story.addToList('ships', 'Chloe B. &amp; J. Rizzoli')
        elif idstr == '90':
            self.story.setMetadata('characters', 'Henry (Once Upon a Time)')
            self.story.setMetadata('category', u'Once Upon a Time (TV)')
        else:
            self.story.addToList('category', 'Harry Potter')
            self.story.addToList('category', 'Furbie')
            self.story.addToList('category', 'Crossover')
            self.story.addToList('category',
                                 u'Puella Magi Madoka Magica/魔法少女まどか★マギカ')
            self.story.addToList('category', u'Magical Girl Lyrical Nanoha')
            self.story.addToList('category', u'Once Upon a Time (TV)')
            self.story.addToList('characters', 'Bob Smith')
            self.story.addToList('characters', 'George Johnson')
            self.story.addToList('characters', 'Fred Smythe')
            self.story.addToList('ships', 'Harry Potter/Ginny Weasley')
            self.story.addToList(
                'ships', 'Harry Potter/Ginny Weasley/Albus Dumbledore')
            self.story.addToList('ships',
                                 'Harry Potter &amp; Hermione Granger')

        self.story.addToList('genre', 'Fantasy')
        self.story.addToList('genre', 'Comedy')
        self.story.addToList('genre', 'Sci-Fi')
        self.story.addToList('genre', 'Noir')

        self.story.addToList('listX', 'xVal1')
        self.story.addToList('listX', 'xVal2')
        self.story.addToList('listX', 'xVal3')
        self.story.addToList('listX', 'xVal4')

        self.story.addToList('listY', 'yVal1')
        self.story.addToList('listY', 'yVal2')
        self.story.addToList('listY', 'yVal3')
        self.story.addToList('listY', 'yVal4')

        self.story.addToList('listZ', 'zVal1')
        self.story.addToList('listZ', 'zVal2')
        self.story.addToList('listZ', 'zVal3')
        self.story.addToList('listZ', 'zVal4')

        self.story.setMetadata('metaA', '98765')
        self.story.setMetadata('metaB', '01245')
        self.story.setMetadata('metaC', 'The mighty metaC!')

        self.chapterUrls = [
            (u'Prologue ' + self.crazystring, self.url + "&chapter=1"),
            ('Chapter 1, Xenos on Cinnabar', self.url + "&chapter=2"),
            ('Chapter 2, Sinmay on Kintikin', self.url + "&chapter=3"),
            ('Chapter 3, Over Cinnabar', self.url + "&chapter=4"),
            ('Chapter 4', self.url + "&chapter=5"),
            ('Chapter 5', self.url + "&chapter=6"),
            ('Chapter 6', self.url + "&chapter=7"),
            ('Chapter 7', self.url + "&chapter=8"),
            ('Chapter 8', self.url + "&chapter=9"),
            #('Chapter 9',self.url+"&chapter=0"),
            #('Chapter 0',self.url+"&chapter=a"),
            #('Chapter a',self.url+"&chapter=b"),
            #('Chapter b',self.url+"&chapter=c"),
            #('Chapter c',self.url+"&chapter=d"),
            #('Chapter d',self.url+"&chapter=e"),
            #('Chapter e',self.url+"&chapter=f"),
            #('Chapter f',self.url+"&chapter=g"),
            #('Chapter g',self.url+"&chapter=h"),
            #('Chapter h',self.url+"&chapter=i"),
            #('Chapter i',self.url+"&chapter=j"),
            #('Chapter j',self.url+"&chapter=k"),
            #('Chapter k',self.url+"&chapter=l"),
            #('Chapter l',self.url+"&chapter=m"),
            #('Chapter m',self.url+"&chapter=n"),
            #('Chapter n',self.url+"&chapter=o"),
        ]
        self.story.setMetadata('numChapters', len(self.chapterUrls))
コード例 #23
0
                        self.story.addToList('characters',char.split(' -')[0])

            if 'Warnings' in label:
                warnings = value.string.split(', ')
                for warning in warnings:
                    if 'None' not in warning:
                        self.story.addToList('warnings',warning.split(' -')[0])

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(value.split(' -')[0], self.dateformat))
            
            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(value.split(' -')[0], self.dateformat))

            
    # grab the text for an individual chapter.
    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """
        Fetch one chapter page and parse it with BeautifulSoup.

        NOTE(review): this excerpt appears truncated -- `soup` is built but
        never used and nothing is returned here; the extraction/return logic
        presumably follows in the full source. Confirm against the original.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)

        soup = bs.BeautifulSoup(data, selfClosingTags=('br','hr','span','center')) # some chapters seem to be hanging up on those tags, so it is safer to close them
コード例 #24
0
    def extractChapterUrlsAndMetadata(self):
        """
        Scrape story metadata and chapters from a Hungarian e-fiction site.

        Labels and several values are compared as byte strings encoded with
        _SOURCE_CODE_ENCODING, matching the site's page encoding. Raises
        StoryDoesNotExist when the page title says so, AdultCheckRequired
        for '18'-rated stories, and FailedToDownload when the story can't
        be located on the author's page.
        """
        soup = self._customized_fetch_url(self.url + '&chapter=1')

        # The page title doubles as the "story not found" indicator.
        element = soup.find('div', id='pagetitle')
        page_title = ''.join(element(text=True)).encode(_SOURCE_CODE_ENCODING)
        if page_title == self._STORY_DOES_NOT_EXIST_PAGE_TITLE:
            raise exceptions.StoryDoesNotExist(self.url)

        author_url = urlparse.urljoin(self.url, element.a['href'])

        # Chapter titles/ids come from the chapter <select> dropdown, if any.
        story_id = self.story.getMetadata('storyId')
        element = soup.find('select', {'name': 'chapter'})
        if element:
            for option in element('option'):
                title = option.string
                url = self._VIEW_CHAPTER_URL_TEMPLATE % (story_id, option['value'])
                self.chapterUrls.append((title, url))

        # Most metadata is only available on the author's story list page;
        # find this story's listbox there.
        soup = self._customized_fetch_url(author_url)
        story_id = self.story.getMetadata('storyId')

        for listbox_div in soup('div', {'class': lambda klass: klass and 'listbox' in klass}):
            a = listbox_div.div.a
            if not a['href'].startswith('viewstory.php?sid='):
                continue

            query_data = _get_query_data(a['href'])
            if query_data['sid'] == story_id:
                break
        else:
            # No listbox matched this story id.
            raise exceptions.FailedToDownload(self.url)

        title = ''.join(a(text=True))
        self.story.setMetadata('title', title)
        # Single-chapter stories have no <select>; use the story URL itself.
        if not self.chapterUrls:
            self.chapterUrls.append((title, self.url))

        element = a.findNextSibling('a')
        self.story.setMetadata('author', element.string)
        query_data = _get_query_data(element['href'])
        self.story.setMetadata('authorId', query_data['uid'])
        self.story.setMetadata('authorUrl', author_url)

        # The rating appears as trailing text after the next <span>.
        element = element.findNextSibling('span')
        rating = element.nextSibling.strip(' [')

        # 'Korhatár nélkül' means "no age limit": skip storing the rating.
        if rating.encode(_SOURCE_CODE_ENCODING) != 'Korhatár nélkül':
            self.story.setMetadata('rating', rating)

        if rating == '18':
            raise exceptions.AdultCheckRequired(self.url)

        element = element.findNextSiblings('a')[1]
        self.story.setMetadata('reviews', element.string)

        # Metadata pairs live in the 'content'/'tail' sections as
        # "<span class=classification>Label:</span> value".
        sections = listbox_div('div', {'class': lambda klass: klass and klass in ['content', 'tail']})
        for section in sections:
            for element in section('span', {'class': 'classification'}):
                key = element.string.encode(_SOURCE_CODE_ENCODING).strip(' :')
                try:
                    value = element.nextSibling.string.encode(_SOURCE_CODE_ENCODING).strip()
                except AttributeError:
                    # Fancy markup in the value; summary parsing handles it.
                    value = None

                # 'Tartalom' = summary
                if key == 'Tartalom':
                    contents = []
                    keep_summary_html = self.getConfig('keep_summary_html')

                    for sibling in _yield_next_siblings(element):
                        if isinstance(sibling, Tag):
                            # Next classification label ends the summary.
                            if sibling.name == 'span' and sibling.get('class', None) == 'classification':
                                break

                            if keep_summary_html:
                                contents.append(self.utf8FromSoup(author_url, sibling))
                            else:
                                contents.append(''.join(sibling(text=True)))
                        else:
                            contents.append(sibling)
                    self.story.setMetadata('description', ''.join(contents))

                # 'Kategória' = category
                elif key == 'Kategória':
                    for sibling in element.findNextSiblings(['a', 'span']):
                        if sibling.name == 'span':
                            break

                        self.story.addToList('category', sibling.string)

                # 'Szereplők' = characters
                elif key == 'Szereplők':
                    for name in value.split(', '):
                        self.story.addToList('characters', name)

                # 'Műfaj' = genre; 'Nincs' = none
                elif key == 'Műfaj':
                    if value != 'Nincs':
                        self.story.setMetadata('genre', value)

                # 'Figyelmeztetés' = warning
                elif key == 'Figyelmeztetés':
                    if value != 'Nincs':
                        for warning in value.split(', '):
                            self.story.addToList('warnings', warning)

                # 'Kihívás' = challenge
                elif key == 'Kihívás':
                    if value != 'Nincs':
                        self.story.setMetadata('challenge', value)

                # 'Sorozat' = series
                elif key == 'Sorozat':
                    if value != 'Nincs':
                        self.story.setMetadata('series', value)

                # 'Fejezetek' = chapters
                elif key == 'Fejezetek':
                    self.story.setMetadata('numChapters', int(value))

                # 'Befejezett' = completed; 'Nem' = no.
                # NOTE(review): marking the story Completed when the value is
                # 'Nem' ("no") looks inverted -- confirm against the live
                # site before changing.
                elif key == 'Befejezett':
                    self.story.setMetadata('status', 'Completed' if value == 'Nem' else 'In-Progress')

                # 'Szavak száma' = word count
                elif key == 'Szavak száma':
                    self.story.setMetadata('numWords', value)

                # 'Feltöltve' = published
                elif key == 'Feltöltve':
                    self.story.setMetadata('datePublished', makeDate(value, self._DATE_FORMAT))

                # 'Frissítve' = updated
                elif key == 'Frissítve':
                    self.story.setMetadata('dateUpdated', makeDate(value, self._DATE_FORMAT))
Code example #25
0
    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story's details page and populate the story metadata.

        The details page is a single table of "<b>Key:</b> value" rows;
        each row is dispatched on its key text.  The story itself is a
        single page, so the 'Title' row also supplies the one chapter
        entry pointing at the story URL.
        """
        url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId')
        soup = self._customized_fetch_url(url)

        keep_summary_html = self.getConfig('keep_summary_html')
        for row in soup.find('table')('tr'):
            cells = row('td')
            # Keys render as "<b>Key:</b>"; strip the trailing colon.
            key = cells[0].b.string.strip(':')
            try:
                value = cells[1].string
            except AttributeError:
                # Cell holds markup rather than a plain string; branches
                # that need the raw cell use cells[1] directly.
                value = None

            if key == 'Title':
                self.story.setMetadata('title', value)
                self.chapterUrls.append((value, self.url))

            elif key == 'File Name':
                self.story.setMetadata('fileName', value)

            elif key == 'File Size':
                self.story.setMetadata('fileSize', value)

            elif key == 'Author':
                element = cells[1].a
                self.story.setMetadata('author', element.string)
                query_data = _get_query_data(element['href'])
                self.story.setMetadata('authorId', query_data['word'])
                self.story.setMetadata('authorUrl', urlparse.urljoin(url, element['href']))

            elif key == 'Date Added':
                # The site uses two different timestamp formats; try the
                # primary one first, then fall back.
                try:
                    date = makeDate(value, self.DATETIME_FORMAT)
                except ValueError:
                    date = makeDate(value, self.ALTERNATIVE_DATETIME_FORMAT)
                self.story.setMetadata('datePublished', date)

            elif key == 'Old Name':
                self.story.setMetadata('oldName', value)

            elif key == 'New Name':
                self.story.setMetadata('newName', value)

            elif key == 'Other Names':
                for name in value.split(', '):
                    self.story.addToList('characters', name)

            # I have no clue how the rating system works, if you are reading
            # transgender fanfiction, you are probably an adult.
            elif key == 'Rating':
                self.story.setMetadata('rating', value)

            elif key == 'Complete':
                # Use the standard 'Completed'/'In-Progress' status values
                # shared by the other adapters (previously this stored the
                # non-standard 'Complete').
                self.story.setMetadata('status', 'Completed' if value == 'Complete' else 'In-Progress')

            elif key == 'Categories':
                for element in cells[1]('a'):
                    self.story.addToList('category', element.string)

            elif key == 'Key Words':
                for element in cells[1]('a'):
                    self.story.addToList('keyWords', element.string)

            elif key == 'Age':
                element = cells[1].a
                self.story.setMetadata('mainCharactersAge', element.string)

            elif key == 'Synopsis':
                element = cells[1]

                # Replace td with div to avoid possible strange formatting in
                # the ebook later on
                element.name = 'div'

                if keep_summary_html:
                    self.story.setMetadata('description', unicode(element))
                else:
                    self.story.setMetadata('description', element.get_text(strip=True))

            elif key == 'Reads':
                self.story.setMetadata('readings', value)
Code example #26
0
    def extractChapterUrlsAndMetadata(self):
        """
        Collect the chapter list and the story metadata.

        Metadata is preferably scraped from the author's fic-list page,
        which carries everything in a single <cite> tag; when no author
        page exists, the sparser story page itself is used.  There is no
        way to tell if a fic is complete, so 'status' stays at its
        'Unknown' default.

        Raises StoryDoesNotExist if the page is not HTML or the story is
        missing from the author's page.
        """
        url = self.url
        logger.debug("URL: "+url)

        data = self.get_page(url)

        # The site serves an error page rather than a 404 status code.
        if "<!DOCTYPE html" not in data:
            raise exceptions.StoryDoesNotExist(url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Find authorid and URL from... author url.
        a = soup.find('a', {'class':'user'})
        if a:
            self.story.setMetadata('authorId',a['href'].split('/')[-1])
            self.story.setMetadata('authorUrl','http://'+self.host+a['href']+'/fics')
            self.story.setMetadata('author',a.string)
        else:
            # No author link; fall back to the "<title> by <author>" heading.
            author = soup.find('h1').string
            author = author[author.rfind('by')+2:].strip()
            self.story.setMetadata('authorId', author)
            self.story.setMetadata('authorUrl', 'http://'+self.getSiteDomain())
            self.story.setMetadata('author', author)

        ## Title: strip the "by <author>" part from the heading.
        self.story.setMetadata('title',stripHTML(soup.find('h1')).replace(
            'by '+self.story.getMetadata('author'), '').strip())

        # Find the chapters:
        for chapter in soup.find('select').find_all('option', value=re.compile(
            '/'+self.story.getMetadata('storyId')+r'/\d+')):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['value']))

        ## One chapter stories do not have a listing for the chapters, so
        ## if none were found, fall back to the URL that was entered.
        if not self.chapterUrls:
            self.chapterUrls.append((self.story.getMetadata('title'), url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # Most of the metadata can be gotten from the story page, but it can
        # all be gotten from the author's fic page, so get it from there
        # whenever an author page exists.
        if self.story.getMetadata('authorUrl') != 'http://'+self.getSiteDomain():
            adata = self.get_page(self.story.getMetadata('authorUrl'))
            asoup = self.make_soup(adata)

            story_found = False
            for story in asoup.find('ul', {'id':'fic_list'}).find_all('li'):
                if self.story.getMetadata('title') == stripHTML(story.a):
                    story_found = True
                    break

            if not story_found:
                raise exceptions.StoryDoesNotExist("Cannot find story '{}' on author's page '{}'".format(
                    url, self.story.getMetadata('authorUrl')))

            self.setDescription(url, stripHTML(story.p).strip())

            # The metadata is contained in a <cite> tag, with only a bold tag
            # and seperated by a period (.).
            # It has 6 'elements'
            # 0 = Rating
            # 1 = chapters and words
            # 2 = Genre
            # 3 = Characters
            # 4 = Posted Date
            # 5 = Updated Date
            metad = stripHTML(story.cite).replace('.,', ',').split('.')
            self.story.setMetadata('rating',metad[0])
            # assumes the word count is the third token of metad[1] -- TODO confirm
            self.story.setMetadata('numWords', metad[1].split()[2])
            self.story.setMetadata('genre',metad[2])
            self.story.setMetadata('characters',metad[3])
            # The dates carry ordinal suffixes ("June 1st, 2010"), which
            # have to be removed before parsing.
            date_pub = metad[4].replace('Created ','').replace('st,', ',').replace('nd,', ',').replace(
                'rd,', ',').replace('th,', ',').strip()
            date_upd = metad[5].replace('Updated ','').replace('st,', ',').replace('nd,', ',').replace(
                'rd,', ',').replace('th,', ',').strip()
            self.story.setMetadata('datePublished', makeDate(date_pub, self.dateformat))
            # Bug fix: previously date_pub was passed here as well, so
            # dateUpdated always mirrored datePublished.
            self.story.setMetadata('dateUpdated', makeDate(date_upd, self.dateformat))

        if not self.story.getMetadata('rating'):
            # There was no author page, so we get what we can from the page
            self.setDescription(url, '>>>>>>>>>> No Summary Found <<<<<<<<<<')
            metad = soup.find('div', {'class':'info'})
            for mdata in metad.find_all('b'):
                if mdata.string == 'Rating:':
                    self.story.setMetadata('rating', mdata.next_sibling)
                elif mdata.string == 'Created:':
                    value = mdata.next_sibling.replace('st,', ',').replace('nd,', ',').replace(
                        'rd,', ',').replace('th,', ',').replace('.', '').strip()
                    self.story.setMetadata('datePublished', makeDate(value, self.dateformat))
                elif mdata.string == 'Updated:':
                    value = mdata.next_sibling.replace('st,', ',').replace('nd,', ',').replace(
                        'rd,', ',').replace('th,', ',').replace('.', '').strip()
                    self.story.setMetadata('dateUpdated', makeDate(value, self.dateformat))

        # Include the site's disclaimer, if present.
        disclaimer = soup.find('strong', {'id':'disclaimer'})
        if disclaimer:
            self.story.setMetadata('disclaimer', stripHTML(disclaimer).replace(
                'Disclaimer:', '').strip())
Code example #27
0
    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story page for the chapter list, then scan the author's
        page for the matching story entry and scrape its metadata.

        Raises StoryDoesNotExist if the story page is effectively empty,
        FailedToDownload if the story cannot be found on the author's
        page, and AdultCheckRequired for NC-17 stories when is_adult is
        not set.
        """
        soup = self._customized_fetch_url(self.url)

        # Since no 404 error code we have to raise the exception ourselves.
        # A title that is just 'by' indicates that there is no author name
        # and no story title available.
        if stripHTML(soup.title) == 'by':
            raise exceptions.StoryDoesNotExist(self.url)

        # Each <option> of the chapter dropdown is one chapter.
        for option in soup.find('select', {'name': 'chapter'}):
            title = stripHTML(option)
            url = self.READ_URL_TEMPLATE % option['value']
            self.chapterUrls.append((title, url))

        # Get the URL to the author's page and find the correct story entry to
        # scrape the metadata
        author_url = urlparse.urljoin(self.url, soup.find('a', {'class': 'headline'})['href'])
        soup = self._customized_fetch_url(author_url)

        story_no = self.story.getMetadata('storyId')
        # Ignore first list_box div, it only contains the author information
        for list_box in soup('div', {'class': 'list_box'})[1:]:
            url = list_box.find('a', {'class': 'fictitle'})['href']
            query_data = _get_query_data(url)

            # Found the div containing the story's metadata; break the loop and
            # parse the element
            if query_data['no'] == story_no:
                break
        else:
            # for/else: only runs when no break occurred, i.e. the story
            # was not present on the author's page.
            raise exceptions.FailedToDownload(self.url)

        # 'list_box' still holds the matching div from the loop above.
        title_anchor = list_box.find('a', {'class': 'fictitle'})
        self.story.setMetadata('title', stripHTML(title_anchor))

        author_anchor = title_anchor.findNextSibling('a')
        self.story.setMetadata('author', stripHTML(author_anchor))
        self.story.setMetadata('authorId', _get_query_data(author_anchor['href'])['who'])
        self.story.setMetadata('authorUrl', urlparse.urljoin(self.url, author_anchor['href']))

        # Review count is the first word of the review link's text.
        list_review = list_box.find('div', {'class': 'list_review'})
        reviews = stripHTML(list_review.a).split(' ', 1)[0]
        self.story.setMetadata('reviews', reviews)

        summary_div = list_box.find('div', {'class': 'list_summary'})
        if not self.getConfig('keep_summary_html'):
            summary = ''.join(summary_div(text=True))
        else:
            summary = self.utf8FromSoup(author_url, summary_div)

        self.story.setMetadata('description', summary)

        # I'm assuming this to be the category, not sure what else it could be
        first_listinfo = list_box.find('div', {'class': 'list_info'})
        self.story.addToList('category', stripHTML(first_listinfo.a))

        # Remaining list_info divs hold "<b>Key:</b> value" pairs.
        for list_info in first_listinfo.findNextSiblings('div', {'class': 'list_info'}):
            for b_tag in list_info('b'):
                key = b_tag.string.strip(': ')
                # Strip colons from the beginning, superfluous spaces and minus
                # characters from the end, and possibly trailing commas from
                # the warnings if only one is present
                value = b_tag.nextSibling.string.strip(': -,')

                if key == 'Genre':
                    for genre in value.split(', '):
                        # Ignore the "none" genre
                        if not genre == 'none':
                            self.story.addToList('genre', genre)

                elif key == 'Rating':
                    self.story.setMetadata('rating', value)

                elif key == 'Complete':
                    self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

                elif key == 'Warning':
                    for warning in value.split(', '):
                        # The string here starts with ", " before the actual list
                        # of values sometimes, so check for an empty warning
                        # and ignore the "none" warning.
                        if not warning or warning == 'none':
                            continue

                        self.story.addToList('warnings', warning)

                elif key == 'Chapters':
                    self.story.setMetadata('numChapters', int(value))

                elif key == 'Words':
                    # Apparently only numChapters need to be an integer for
                    # some strange reason. Remove possible ',' characters as to
                    # not confuse the codebase down the line
                    self.story.setMetadata('numWords', value.replace(',', ''))

                elif key == 'Started':
                    self.story.setMetadata('datePublished', makeDate(value, self.STARTED_DATETIME_FORMAT))

                elif key == 'Updated':
                    date_string, period = value.rsplit(' ', 1)
                    date = makeDate(date_string, self.UPDATED_DATETIME_FORMAT)

                    # Rather ugly hack to work around Calibre's changing of
                    # Python's locale setting, causing am/pm to not be properly
                    # parsed by strptime() when using a non-english locale
                    if period == 'pm':
                        date += timedelta(hours=12)
                    self.story.setMetadata('dateUpdated', date)

        if self.story.getMetadata('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)
Code example #28
0
 def _parse_date(self, text):
     """Return *text* parsed via makeDate, with any fractional seconds removed."""
     # Timestamps sometimes carry microseconds ("...:05.123+01:00") that
     # the site's date format cannot express; drop them before parsing.
     return makeDate(re.sub(r'\.\d+\+', '+', text), self._dateformat)
Code example #29
0
            queryString = urllib.urlencode(
                (
                    ("type", 3),
                    ("field", 1),
                    # need translate here for the weird accented letters
                    ("text", _latinize(title)),
                    ("search", "Search"),
                )
            )
            searchUrl = "http://%s/Story_Chapter_Search.php?%s" % (self.host, queryString)
            logger.debug("Search URL: <%s>" % searchUrl)
            searchHtml = _fix_broken_markup(self._fetchUrl(searchUrl))
            searchSoup = bs.BeautifulSoup(searchHtml)
            date = searchSoup.find(text="Updated:").nextSibling.string
            logger.debug("Last Updated: '%s'" % date)
            self.story.setMetadata("dateUpdated", makeDate(date, self.dateformat))
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

    def getChapterText(self, url):

        logger.debug("Downloading chapter <%s>" % url)

        time.sleep(0.5)
        htmldata = _fix_broken_markup(self._fetchUrl(url))
        soup = bs.BeautifulSoup(htmldata)

        # strip comments from soup
Code example #30
0
File: adapter_test1.py  Project: besnef/FanFicFare
    def extractChapterUrlsAndMetadata(self):
        idstr = self.story.getMetadata('storyId')
        idnum = int(idstr)
        self.do_sleep()

        if idnum >= 1000:
            logger.warn("storyId:%s - Custom INI data will be used."%idstr)

            sections = ['teststory:%s'%idstr,'teststory:defaults']
            #print("self.get_config_list(sections,'valid_entries'):%s"%self.get_config_list(sections,'valid_entries'))
            for key in self.get_config_list(sections,'valid_entries'):
                if key.endswith("_list"):
                    nkey = key[:-len("_list")]
                    #print("addList:%s"%(nkey))
                    for val in self.get_config_list(sections,key):
                        #print("addList:%s->%s"%(nkey,val))
                        self.story.addToList(nkey,val.decode('utf-8').replace('{{storyId}}',idstr))
                else:
                    # Special cases:
                    if key in ['datePublished','dateUpdated']:
                        self.story.setMetadata(key,makeDate(self.get_config(sections,key),"%Y-%m-%d"))
                    else:
                        self.story.setMetadata(key,self.get_config(sections,key).decode('utf-8').replace('{{storyId}}',idstr))
                    #print("set:%s->%s"%(key,self.story.getMetadata(key)))

            self.chapterUrls = []
            for (j,chap) in enumerate(self.get_config_list(sections,'chaptertitles'),start=1):
                self.chapterUrls.append( (chap,self.url+"&chapter=%d"%j) )
            # self.chapterUrls = [(u'Prologue '+self.crazystring,self.url+"&chapter=1"),
            #                 ('Chapter 1, Xenos on Cinnabar',self.url+"&chapter=2"),
            #                 ]
            self.story.setMetadata('numChapters',len(self.chapterUrls))
            
            return

        if idstr == '665' and not (self.is_adult or self.getConfig("is_adult")):
            logger.warn("self.is_adult:%s"%self.is_adult)
            raise exceptions.AdultCheckRequired(self.url)

        if idstr == '666':
            raise exceptions.StoryDoesNotExist(self.url)

        if idstr.startswith('670'):
            time.sleep(1.0)
            
        if idstr.startswith('671'):
            time.sleep(1.0)
            
        if self.getConfig("username"):
            self.username = self.getConfig("username")
        
        if idstr == '668' and self.username != "Me" :
            raise exceptions.FailedToLogin(self.url,self.username)

        if idstr == '664':
            self.story.setMetadata(u'title',"Test Story Title "+idstr+self.crazystring)
            self.story.setMetadata('author','Test Author aa bare amp(&) quote(&#39;) amp(&amp;)')
        else:
            self.story.setMetadata(u'title',"Test Story Title "+idstr)
            self.story.setMetadata('author','Test Author aa')
        self.story.setMetadata('storyUrl',self.url)
        self.setDescription(self.url,u'Description '+self.crazystring+u''' Done
<p>
Some more longer description.  "I suck at summaries!"  "Better than it sounds!"  "My first fic"
''')
        self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
        if idstr == '669':
            self.story.setMetadata('dateUpdated',datetime.datetime.now())
        else:
            self.story.setMetadata('dateUpdated',makeDate("1975-04-15","%Y-%m-%d"))

        if idstr != '674':
            self.story.setMetadata('numWords','123456')

        if idnum % 2 == 1:
            self.story.setMetadata('status','In-Progress')
        else:
            self.story.setMetadata('status','Completed')

        # greater than 10, no language or series.
        if idnum < 10:
            langs = {
                0:"English",
                1:"Russian",
                2:"French",
                3:"German",
                }
            self.story.setMetadata('language',langs[idnum%len(langs)])
            self.setSeries('The Great Test',idnum)
            self.story.setMetadata('seriesUrl','http://test1.com?seriesid=1')
        if idnum == 0:
            self.setSeries("A Nook Hyphen Test "+self.story.getMetadata('dateCreated'),idnum)
            self.story.setMetadata('seriesUrl','http://test1.com?seriesid=0')
            
        self.story.setMetadata('rating','Tweenie')

        if idstr == '673':
            self.story.addToList('author','Author From List 1')
            self.story.addToList('author','Author From List 2')
            self.story.addToList('author','Author From List 3')
            self.story.addToList('author','Author From List 4')
            self.story.addToList('author','Author From List 5')
            self.story.addToList('author','Author From List 6')
            self.story.addToList('author','Author From List 7')
            self.story.addToList('author','Author From List 8')
            self.story.addToList('author','Author From List 9')
            self.story.addToList('author','Author From List 0')
            self.story.addToList('author','Author From List q')
            self.story.addToList('author','Author From List w')
            self.story.addToList('author','Author From List e')
            self.story.addToList('author','Author From List r')
            self.story.addToList('author','Author From List t')
            self.story.addToList('author','Author From List y')
            self.story.addToList('author','Author From List u')
            self.story.addToList('author','Author From List i')
            self.story.addToList('author','Author From List o')
        
            self.story.addToList('authorId','98765-1')
            self.story.addToList('authorId','98765-2')
            self.story.addToList('authorId','98765-3')
            self.story.addToList('authorId','98765-4')
            self.story.addToList('authorId','98765-5')
            self.story.addToList('authorId','98765-6')
            self.story.addToList('authorId','98765-7')
            self.story.addToList('authorId','98765-8')
            self.story.addToList('authorId','98765-9')
            self.story.addToList('authorId','98765-0')
            self.story.addToList('authorId','98765-q')
            self.story.addToList('authorId','98765-w')
            self.story.addToList('authorId','98765-e')
            self.story.addToList('authorId','98765-r')
            self.story.addToList('authorId','98765-t')
            self.story.addToList('authorId','98765-y')
            self.story.addToList('authorId','98765-u')
            self.story.addToList('authorId','98765-i')
            self.story.addToList('authorId','98765-o')
        
            self.story.addToList('authorUrl','http://author/url-1')
            self.story.addToList('authorUrl','http://author/url-2')
            self.story.addToList('authorUrl','http://author/url-3')
            self.story.addToList('authorUrl','http://author/url-4')
            self.story.addToList('authorUrl','http://author/url-5')
            self.story.addToList('authorUrl','http://author/url-6')
            self.story.addToList('authorUrl','http://author/url-7')
            self.story.addToList('authorUrl','http://author/url-8')
            self.story.addToList('authorUrl','http://author/url-9')
            self.story.addToList('authorUrl','http://author/url-0')
            self.story.addToList('authorUrl','http://author/url-q')
            self.story.addToList('authorUrl','http://author/url-w')
            self.story.addToList('authorUrl','http://author/url-e')
            self.story.addToList('authorUrl','http://author/url-r')
            self.story.addToList('authorUrl','http://author/url-t')
            self.story.addToList('authorUrl','http://author/url-y')
            self.story.addToList('authorUrl','http://author/url-u')
            self.story.addToList('authorUrl','http://author/url-i')
            self.story.addToList('authorUrl','http://author/url-o')

            self.story.addToList('category','Power Rangers')
            self.story.addToList('category','SG-1')
            self.story.addToList('genre','P**n')
            self.story.addToList('genre','Drama')
        else:
            self.story.setMetadata('authorId','98765')
            self.story.setMetadata('authorUrl','http://author/url')

        self.story.addToList('warnings','Swearing')
        self.story.addToList('warnings','Violence')

        if idstr == '80':
            self.story.addToList('category',u'Rizzoli &amp; Isles')
            self.story.addToList('characters','J. Rizzoli')
        elif idstr == '81':
            self.story.addToList('category',u'Pitch Perfect')
            self.story.addToList('characters','Chloe B.')
        elif idstr == '82':
            self.story.addToList('characters','Henry (Once Upon a Time)')        
            self.story.addToList('category',u'Once Upon a Time (TV)')
        elif idstr == '83':
            self.story.addToList('category',u'Rizzoli &amp; Isles')
            self.story.addToList('characters','J. Rizzoli')
            self.story.addToList('category',u'Pitch Perfect')
            self.story.addToList('characters','Chloe B.')
            self.story.addToList('ships','Chloe B. &amp; J. Rizzoli')
        elif idstr == '90':
            self.story.setMetadata('characters','Henry (Once Upon a Time)')        
            self.story.setMetadata('category',u'Once Upon a Time (TV)')
        else:
            self.story.addToList('category','Harry Potter')
            self.story.addToList('category','Furbie')
            self.story.addToList('category','Crossover')
            self.story.addToList('category',u'Puella Magi Madoka Magica/魔法少女まどか★マギカ')
            self.story.addToList('category',u'Magical Girl Lyrical Nanoha')
            self.story.addToList('category',u'Once Upon a Time (TV)')
            self.story.addToList('characters','Bob Smith')
            self.story.addToList('characters','George Johnson')
            self.story.addToList('characters','Fred Smythe')
            self.story.addToList('ships','Harry Potter/Ginny Weasley')
            self.story.addToList('ships','Harry Potter/Ginny Weasley/Albus Dumbledore')
            self.story.addToList('ships','Harry Potter &amp; Hermione Granger')
            
        self.story.addToList('genre','Fantasy')
        self.story.addToList('genre','Comedy')
        self.story.addToList('genre','Sci-Fi')
        self.story.addToList('genre','Noir')
                
        self.story.addToList('listX','xVal1')
        self.story.addToList('listX','xVal2')
        self.story.addToList('listX','xVal3')
        self.story.addToList('listX','xVal4')
        
        self.story.addToList('listY','yVal1')
        self.story.addToList('listY','yVal2')
        self.story.addToList('listY','yVal3')
        self.story.addToList('listY','yVal4')
        
        self.story.addToList('listZ','zVal1')
        self.story.addToList('listZ','zVal2')
        self.story.addToList('listZ','zVal3')
        self.story.addToList('listZ','zVal4')
        
        self.story.setMetadata('metaA','98765')
        self.story.setMetadata('metaB','01245')
        self.story.setMetadata('metaC','The mighty metaC!')

        self.chapterUrls = [(u'Prologue '+self.crazystring,self.url+"&chapter=1"),
                            ('Chapter 1, Xenos on Cinnabar',self.url+"&chapter=2"),
                            ('Chapter 2, Sinmay on Kintikin',self.url+"&chapter=3"),
                            ('Chapter 3, Over Cinnabar',self.url+"&chapter=4"),
                            ('Chapter 4',self.url+"&chapter=5"),
                            ('Chapter 5',self.url+"&chapter=6"),
                            ('Chapter 6',self.url+"&chapter=7"),
                            ('Chapter 7',self.url+"&chapter=8"),
                            ('Chapter 8',self.url+"&chapter=9"),
                            #('Chapter 9',self.url+"&chapter=0"),
                            #('Chapter 0',self.url+"&chapter=a"),
                            #('Chapter a',self.url+"&chapter=b"),
                            #('Chapter b',self.url+"&chapter=c"),
                            #('Chapter c',self.url+"&chapter=d"),
                            #('Chapter d',self.url+"&chapter=e"),
                            #('Chapter e',self.url+"&chapter=f"),
                            #('Chapter f',self.url+"&chapter=g"),
                            #('Chapter g',self.url+"&chapter=h"),
                            #('Chapter h',self.url+"&chapter=i"),
                            #('Chapter i',self.url+"&chapter=j"),
                            #('Chapter j',self.url+"&chapter=k"),
                            #('Chapter k',self.url+"&chapter=l"),
                            #('Chapter l',self.url+"&chapter=m"),
                            #('Chapter m',self.url+"&chapter=n"),
                            #('Chapter n',self.url+"&chapter=o"),
                            ]
        self.story.setMetadata('numChapters',len(self.chapterUrls))
Code example #31
0
class SiyeCoUkAdapter(BaseSiteAdapter):  # XXX
    """
    Adapter for siye.co.uk ("Sink Into Your Eyes"), an eFiction-like
    Harry Potter archive.

    Chapter links are scraped from the story page itself; most of the
    metadata (title, dates, word count, etc.) comes from the author's
    story-list page, which formats series and non-series stories
    differently.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = [
            "Windows-1252",
            "utf8",
        ]  # 1252 is a superset of iso-8859-1.
        # Most sites that claim to be
        # iso-8859-1 (and some that claim to be
        # utf8) are really windows-1252.

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() +
                     '/siye/viewstory.php?sid=' +
                     self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'siye')  # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y.%m.%d"  # XXX

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.siye.co.uk'  # XXX

    @classmethod
    def getAcceptDomains(cls):
        # Accept story URLs with or without the leading 'www.'.
        return ['www.siye.co.uk', 'siye.co.uk']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://" + cls.getSiteDomain() + "/siye/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        # optional www. and optional /siye/ path segment, numeric sid only.
        return re.escape(
            "http://") + r"(www\.)?siye\.co\.uk/(siye/)?" + re.escape(
                "viewstory.php?sid=") + r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story page plus the author's story-list page and
        populate self.chapterUrls and the story metadata.

        Raises exceptions.StoryDoesNotExist on a 404 or when no author
        link can be found on the story page.
        """

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        # Except it doesn't this time. :-/
        url = self.url  #+'&index=1'+addurl
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:  # 'as' form works on py2.6+ and py3
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise  # bare raise preserves the original traceback

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Now go hunting for all the meta data and the chapter list.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        if a is None:
            # No author link on the page ==> not a real story page.
            raise exceptions.StoryDoesNotExist(self.url)
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl',
                               'http://' + self.host + '/siye/' + a['href'])
        self.story.setMetadata('author', a.string)

        # need(or easier) to pull other metadata from the author's list page.
        authsoup = self.make_soup(
            self._fetchUrl(self.story.getMetadata('authorUrl')))

        # remove author profile incase they've put the story URL in their bio.
        profile = authsoup.find('div', {'id': 'profile'})
        if profile:  # in case it changes.
            profile.extract()

        ## Title -- the story link on the author page whose sid matches.
        titlea = authsoup.find(
            'a',
            href=re.compile(r'viewstory.php\?sid=' +
                            self.story.getMetadata('storyId') + "$"))
        self.story.setMetadata('title', stripHTML(titlea))

        # Find the chapters (from soup, not authsoup):
        for chapter in soup.findAll(
                'a',
                href=re.compile(r'viewstory.php\?sid=' +
                                self.story.getMetadata('storyId') +
                                r"&chapter=\d+$")):  # raw string so \d is a regex class
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append(
                (stripHTML(chapter),
                 'http://' + self.host + '/siye/' + chapter['href']))

        if self.chapterUrls:
            self.story.setMetadata('numChapters', len(self.chapterUrls))
        else:
            # one-shot story: no chapter links, use the story URL itself.
            self.chapterUrls.append((self.story.getMetadata('title'), url))
            self.story.setMetadata('numChapters', 1)

        # The stuff we can get from the chapter list/one-shot page are
        # in the first table with 95% width.
        metatable = soup.find('table', {'width': '95%'})

        # Categories
        cat_as = metatable.findAll('a', href=re.compile(r'categories.php'))
        for cat_a in cat_as:
            self.story.addToList('category', stripHTML(cat_a))

        # Remaining metadata is plain "Label: value" text lines.
        moremetaparts = stripHTML(metatable).split('\n')
        for part in moremetaparts:
            part = part.strip()
            if part.startswith("Characters:"):
                part = part[part.find(':') + 1:]
                for item in part.split(','):
                    if item.strip() == "Harry/Ginny":
                        # site uses a combined tag; split into both names.
                        self.story.addToList('characters', "Harry")
                        self.story.addToList('characters', "Ginny")
                    elif item.strip() not in ("None", "All"):
                        self.story.addToList('characters', item)

            if part.startswith("Genres:"):
                part = part[part.find(':') + 1:]
                for item in part.split(','):
                    if item.strip() != "None":
                        self.story.addToList('genre', item)

            if part.startswith("Warnings:"):
                part = part[part.find(':') + 1:]
                for item in part.split(','):
                    if item.strip() != "None":
                        self.story.addToList('warnings', item)

            if part.startswith("Rating:"):
                part = part[part.find(':') + 1:]
                self.story.setMetadata('rating', part)

            if part.startswith("Summary:"):
                part = part[part.find(':') + 1:]
                self.setDescription(url, part)
                #self.story.setMetadata('description',part)

        # want to get the next tr of the table.
        #print("%s"%titlea.parent.parent.findNextSibling('tr'))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # SIYE formats stories in the author list differently when their part of a series.
        # Look for non-series...
        divdesc = titlea.parent.parent.find('div', {'class': 'desc'})
        if not divdesc:
            # ... now look for series.
            divdesc = titlea.parent.parent.findNextSibling('tr').find(
                'div', {'class': 'desc'})

        # "Name: value - Name: value" pairs; normalize ' - ' to newlines first.
        moremeta = stripHTML(divdesc)
        #print("moremeta:%s"%moremeta)
        for part in moremeta.replace(' - ', '\n').split('\n'):
            #print("part:%s"%part)
            try:
                (name, value) = part.split(': ')
            except ValueError:
                # not going to worry about fancier processing for the bits
                # that don't match (no ': ' or too many).
                continue
            name = name.strip()
            value = value.strip()
            if name == 'Published':
                self.story.setMetadata('datePublished',
                                       makeDate(value, self.dateformat))
            if name == 'Updated':
                self.story.setMetadata('dateUpdated',
                                       makeDate(value, self.dateformat))
            if name == 'Completed':
                if value == 'Yes':
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')
            if name == 'Words':
                self.story.setMetadata('numWords', value)

        try:
            # Find Series name from series URL.
            a = titlea.findPrevious(
                'a', href=re.compile(r"series.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll(
                'a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            # Position of this story in the series list == series index.
            i = 1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid=' +
                                 self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl', series_url)
                    break
                i += 1

        except Exception:
            # Deliberate best-effort: series parsing failure is non-fatal.
            # I find it hard to care if the series parsing fails
            pass
コード例 #32
0
                    BtVS = False
                if 'BtVS/AtS Non-Crossover' == cat.string:
                    BtVSNonX = True

        verticaltabletds = verticaltable.findAll('td')
        self.story.setMetadata('rating', verticaltabletds[2].string)
        self.story.setMetadata('numWords', verticaltabletds[4].string)

        # Complete--if completed.
        if 'Yes' in verticaltabletds[10].string:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        #print("date:%s"%verticaltabletds[8])
        self.story.setMetadata('datePublished',makeDate(stripHTML(verticaltabletds[8].string), self.dateformat))
        self.story.setMetadata('dateUpdated',makeDate(stripHTML(verticaltabletds[9].string), self.dateformat))

        for icon in storydiv.find('span',{'class':'storyicons'}).findAll('img'):
            if( icon['title'] not in ['Non-Crossover'] ) :
                self.story.addToList('genre',icon['title'])
            else:
                if not BtVSNonX:
                    BtVS = False # Don't add BtVS if Non-Crossover, unless it's a BtVS/AtS Non-Crossover

        #print("BtVS: %s BtVSNonX: %s"%(BtVS,BtVSNonX))
        if BtVS:
            self.story.addToList('category','Buffy: The Vampire Slayer')

        pseries = soup.find('p', {'style':'margin-top:0px'})
        #print("pseries:%s"%pseries.get_text())
コード例 #33
0
        metatext = stripHTML(smalldiv)

        if 'Completed: Yes' in metatext:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        wordstart = metatext.rindex('Word count:') + 12
        words = metatext[wordstart:metatext.index(' ', wordstart)]
        self.story.setMetadata('numWords', words)

        datesdiv = soup.find('div', {'class': 'bottom'})
        dates = stripHTML(datesdiv).split()
        # Published: 04/26/2011 Updated: 03/06/2013
        self.story.setMetadata('datePublished',
                               makeDate(dates[1], self.dateformat))
        self.story.setMetadata('dateUpdated',
                               makeDate(dates[3], self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a',
                          href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll(
                'a', href=re.compile(r'viewstory.php\?sid=\d+'))
コード例 #34
0
                raise exceptions.StoryDoesNotExist(authorurl)
            else:
                raise e

        ## Find link to url in author's page
        ## site has started using //domain.name/asdf urls remove https?: from front
        ## site has started putting https back on again.
        storyLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:])))
#         storyLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:])

        if storyLink is not None:
            # pull the published date from the author page
            # default values from single link.  Updated below if multiple chapter.
            logger.debug("Found story on the author page.")
            date = storyLink.parent.parent.findAll('td')[-1].text
            self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
            self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))

        if storyLink is not None:
            urlTr = storyLink.parent.parent
            if "sl" in urlTr['class']:
                isSingleStory = False
            else:
                isSingleStory = True
        else:
            raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))

        if isSingleStory:
#             self.chapterUrls = [(soup1.h1.string, self.url)]
#             self.story.setMetadata('title', soup1.h1.string)
コード例 #35
0
class WWWArea52HKHNetAdapter(BaseSiteAdapter):
    """
    Adapter for www.area52hkh.net, a Stargate fan-fiction archive.

    Each story is a single page (one 'chapter'); nearly all metadata is
    scraped from the matching entry on the author's (possibly paginated)
    story-list page.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "******"  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # Getting the storyId from url - http://www.area52hkh.net/[Folder]/[AuthorID]/[STORYID].php
        # I'm setting these variables here, because I use them later.
        self.folder = self.parsedUrl.path.split('/', )[1]
        self.authorId = self.parsedUrl.path.split('/', )[2]
        self.storyId = self.parsedUrl.path.split('/', )[3].replace(
            '.php', '').replace('.htm', '').replace('.html', '')
        # extension decides which scraping branch to use below.
        self.extension = self.parsedUrl.path.split('.')[1]

        self.story.setMetadata('storyId', self.storyId)
        self.story.setMetadata('authorId', self.authorId)

        # normalized story URL.
        self._setURL('http://{0}/{1}/{2}/{3}.{4}'.format(
            self.getSiteDomain(), self.folder,
            self.story.getMetadata('authorId'),
            self.story.getMetadata('storyId'), self.extension))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'a52hkh')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.area52hkh.net'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://www.area52hkh.net/[folder]/[author]/[story].php"

    def getSiteURLPattern(self):
        #        return r"http(s)?://www\.lushstories\.com/stories/(?P<category>[^/]+)/(?P<id>\S+)\.aspx"
        return r"http://www\.area52hkh\.net/as([a-z])/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)\.(php|htm|html)"
#        return r"http://www\.area52hkh\.net/as([a-z])/(?P<author>[^/]+)/([a-zA-Z0-9_-]+)\.php"

## Getting the chapter list and the meta data, plus 'is adult' checking.

    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story page, then walk the author's story-list pages
        until the entry for this story is found, and pull the metadata
        from that entry.

        Raises exceptions.StoryDoesNotExist on 404 or for the old .htm
        story format, which is not supported.
        """

        url = self.url
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:  # 'as' form works on py2.6+ and py3
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise  # bare raise preserves the original traceback

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title and Series
        if self.extension == 'htm':
            raise exceptions.StoryDoesNotExist(
                'This story is in a format that has not been coded yet.')

        elif self.extension == 'php':
            a = soup.find('h1')
            self.story.setMetadata('title', stripHTML(a))

            # Find authorid and URL from... author url.
            a = soup.find('a', href=re.compile(r"/author.php\?name=\S+"))
            self.story.setMetadata('authorUrl',
                                   'http://' + self.host + '/' + a['href'])
            self.story.setMetadata('author', a.string)

            # There is only one 'chapter' for each story, so we go with the self.url
            # and the title of the story for the heading
            self.chapterUrls.append((self.story.getMetadata('title'), url))

            self.story.setMetadata('numChapters', len(self.chapterUrls))

            storya = None
            authsoup = None
            storyblock = None
            authurl = self.story.getMetadata('authorUrl')

            ## author can have more than one page of stories.

            while storyblock == None:
                ## Here is a sample of one of the storyblocks
                #<div class="story">
                #<p class="title"><a href="/[folder]/[author]/[story].php" target="story">Story Title</a> &nbsp;&nbsp;&nbsp;<i>Part:</i> 1/3 of Series Title</p>
                #<table>
                #    <tr>
                #        <td class="image"><img src="/_images/show_s.gif" class="icon" alt="SG1" title="SG1"><br></td>
                #        <td class="detail">
                #            <i>Date Archived:</i> [Published Date]<br>
                #            <i>Pairing:</i> [Ships]<br>
                #            <i>Categories:</i> [Categories]<br>
                #            <i>Season/Episode:</i>[Season]
                #        </td>
                #        <td class="detail">
                #            <i>Size:</i> [Size]  <br>
                #            <img src="/_images/info.gif" class="icon" alt="More Info" title="More Info" onmouseover="swap(12575,'block');" onmouseout="swap(12575,'none');">
                #            <div class="info" id="i12575">
                #                [[[Text Written here]]]
                #            </div><br>
                #            <i>Rating:</i> [Rating]<br>
                #            <i>Warnings:</i> [warnings]<br>
                #            <i>Spoilers:</i> [spoilers]
                #        </td>
                #    </tr>
                #</table>
                #<p class="summary"><i>Summary:</i> [Summary]</p>
                #</div>

                # no storya, but do have authsoup--we're looping on author pages.
                if authsoup != None:
                    # last author link with offset should be the 'Next' link.
                    nextpage = authsoup.find('div', {
                        'id': 'links'
                    }).find('a', {'title': 'Next'})
                    authurl = u'http://%s/%s' % (self.getSiteDomain(),
                                                 nextpage['href'])

                # Need author page for most of the metadata.
                logger.debug("fetching author page: (%s)" % authurl)
                authsoup = self.make_soup(self._fetchUrl(authurl))

                storyas = authsoup.findAll(
                    'a',
                    href=re.compile(r'/' + self.folder + '/' +
                                    self.story.getMetadata('authorId') + '/' +
                                    self.story.getMetadata('storyId') +
                                    '.php'))
                for storya in storyas:
                    storyblock = storya.findParent('div', {'class': 'story'})
                    if storyblock != None:
                        # bug fix: was 'continue', which kept looping and
                        # could reset storyblock to None on a later match,
                        # forcing a spurious next-page fetch.  Stop at the
                        # first anchor that has a story <div> parent.
                        break

            #checking to see if it is part of a series/bigger story
            series = storyblock.find('p', {'class': 'title'})

            #Some storyblocks have images, which interfers with the retreival of the metadata, so I
            # am going to remove it.
            for tag in storyblock.find_all('img'):
                tag.extract()

            #Remove the title link, since we already have the title above
            series.find('a').extract()

            ## I've seen a non-breaking space in some of the storyblocks
            ## so we are going to remove them.
            ## NOTE(review): mixes a bytes pattern with str on the py2
            ## str==bytes assumption -- revisit if ported to py3.
            series = stripHTML(
                str(series.renderContents()).replace(b"\xc2\xa0", '')).strip()
            if len(series) > 0:
                self.story.setMetadata('series', series)

            ## Now we get the rest of the metadata
            ### some details have an imbedded div for extra info from the author
            ### this is being extracted, and put into a Metadata item called 'authorinfo'
            infodiv = storyblock.find('div', {'class': 'info'})
            if infodiv != None:
                self.story.setMetadata('authorinfo', stripHTML(infodiv))
                infodiv.extract()

            # Each <i> label is followed by its value as a sibling text node.
            details = storyblock.findAll('i')
            for detail in details:
                detail_text = stripHTML(detail)
                value = detail.nextSibling
                value_text = value.string.strip()
                if 'Date Archived' in detail_text:
                    # single-page story: published == updated
                    self.story.setMetadata(
                        'datePublished', makeDate(value_text, self.dateformat))
                    self.story.setMetadata(
                        'dateUpdated', makeDate(value_text, self.dateformat))
                elif 'Pairing' in detail_text:
                    self.story.setMetadata('ships', value_text)
                elif 'Categories' in detail_text:
                    self.story.setMetadata('category', value_text)
                elif 'Season/Episode' in detail_text:
                    self.story.setMetadata('season', value_text)
                elif 'Size' in detail_text:
                    self.story.setMetadata('size', value_text)
                elif 'Rating' in detail_text:
                    self.story.setMetadata('rating', value_text)
                elif 'Warnings' in detail_text:
                    self.story.setMetadata('warnings', value_text)
                elif 'Spoilers' in detail_text:
                    if value_text != 'None':
                        self.story.setMetadata('spoilers', value_text)
                elif 'Summary' in detail_text:
                    if not self.getConfig("keep_summary_html"):
                        value = stripHTML(value).replace('Summary:',
                                                         '').strip()
                    else:
                        value = str(value).replace('<i>Summary:</i>',
                                                   '').strip()
                    self.setDescription(url, value)
コード例 #36
0
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value.nextSibling).split(' |')[0], self.dateformat))
            
            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
コード例 #37
0
                '{0} says: Invalid authorid'.format(self.getSiteDomain()))

        asoup = self.make_soup(adata)

        lc2 = asoup.find('a',
                         href=re.compile(r'read.php\?storyid=' +
                                         self.story.getMetadata('storyId')))
        lc2 = lc2.findPrevious('table')
        summry = stripHTML(lc2.find('td',
                                    {'class': 'highlightcolor2'})).strip()
        self.setDescription(url, summry)

        lupdt = lc2.findAll('td',
                            {'class': 'highlightcolor1'})[1].string.replace(
                                'Last updated', '').strip()
        self.story.setMetadata('dateUpdated', makeDate(lupdt, self.dateformat))

        self._setURL('http://' + self.getSiteDomain() + '/read.php?storyid=' +
                     self.story.getMetadata('storyId') + '&chapno=1')
        ## and that is all of the metadata that is on this site...

    # grab the text for an individual chapter.
    def getChapterText(self, url):

        logger.debug('Getting chapter text from: %s' % url)

        soup = self.make_soup(self._fetchUrl(url))

        chap = soup.find('td', {'class': 'content_pane'})

        if chap == None:
コード例 #38
0
class TheAlphaGateComAdapter(BaseSiteAdapter):
    """
    Adapter for www.thealphagate.com, a standard eFiction archive.

    Chapter links and metadata (the '<span class="label">Label:</span>
    value' pairs) are all scraped from the story's chapter-index page.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"]  # 1252 is a superset of iso-8859-1.
        # Most sites that claim to be
        # iso-8859-1 (and some that claim to be
        # utf8) are really windows-1252.
        self.username = "******"  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' +
                     self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'tag')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %b %Y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.thealphagate.com'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://" + self.getSiteDomain() +
                         "/viewstory.php?sid=") + r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the chapter-index page and populate self.chapterUrls plus
        the story metadata from the label/value spans.

        Raises exceptions.StoryDoesNotExist on 404 and
        exceptions.AccessDenied for unvalidated stories.
        """

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url + '&index=1'
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:  # 'as' form works on py2.6+ and py3
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise  # bare raise preserves the original traceback

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(
                self.getSiteDomain() +
                " says: Access denied. This story has not been validated by the adminstrators of this site."
            )

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title -- the link whose sid matches exactly (anchored with $).
        a = soup.find('a',
                      href=re.compile(r'viewstory.php\?sid=' +
                                      self.story.getMetadata('storyId') + "$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl',
                               'http://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        for chapter in soup.findAll(
                'a',
                href=re.compile(r'viewstory.php\?sid=' +
                                self.story.getMetadata('storyId') +
                                r"&chapter=\d+$")):  # raw string so \d is a regex class
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append(
                (stripHTML(chapter),
                 'http://' + self.host + '/' + chapter['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: d['class'] without raising on plain text nodes.
        def defaultGetattr(d, k):
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span', {'class': 'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                while 'label' not in defaultGetattr(value, 'class'):
                    svalue += unicode(value)
                    value = value.nextSibling
                self.setDescription(url, svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters', char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre', genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll(
                    'a', href=re.compile(
                        r'browse.php\?type=class&type_id=2'))  # XXX
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata(
                    'datePublished', makeDate(stripHTML(value),
                                              self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a',
                          href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll(
                'a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            # Position of this story in the series list == series index.
            i = 1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid=' +
                                 self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl', series_url)
                    break
                i += 1

        except Exception:
            # Deliberate best-effort: series parsing failure is non-fatal.
            # I find it hard to care if the series parsing fails
            pass
コード例 #39
0
    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story's metadata page, populate self.story metadata and
        build self.chapterUrls from the chapter links on the same page.

        Raises StoryDoesNotExist when the site shows its error div, and
        AdultCheckRequired when the title link is javascript-guarded and
        the user has not opted in to adult content.
        """
        soup = self._customized_fetch_url(self.url + self.METADATA_URL_SUFFIX)

        # Site reports missing stories via a div, not an HTTP error.
        errortext_div = soup.find('div', {'class': 'errortext'})
        if errortext_div:
            error_text = ''.join(errortext_div(text=True)).strip()
            if error_text == self.STORY_DOES_NOT_EXIST_ERROR_TEXT:
                raise exceptions.StoryDoesNotExist(self.url)

        # No additional login is required, just check for adult
        pagetitle_div = soup.find('div', id='pagetitle')
        if pagetitle_div.a['href'].startswith('javascript:'):
            if not(self.is_adult or self.getConfig('is_adult')):
                raise exceptions.AdultCheckRequired(self.url)

        # Re-fetch with the age-consent suffix appended so adult-rated
        # content is visible in the returned markup.
        url = ''.join([self.url, self.METADATA_URL_SUFFIX, self.AGE_CONSENT_URL_SUFFIX])
        soup = self._customized_fetch_url(url)

        pagetitle_div = soup.find('div', id='pagetitle')
        self.story.setMetadata('title', stripHTML(pagetitle_div.a))

        # The author link is the anchor immediately following the title
        # anchor; its uid query parameter is the author id.
        author_anchor = pagetitle_div.a.findNextSibling('a')
        url = urlparse.urljoin(self.BASE_URL, author_anchor['href'])
        components = urlparse.urlparse(url)
        query_data = urlparse.parse_qs(components.query)

        self.story.setMetadata('author', stripHTML(author_anchor))
        self.story.setMetadata('authorId', query_data['uid'][0])
        self.story.setMetadata('authorUrl', url)

        sort_div = soup.find('div', id='sort')
        # Second anchor in the sort div holds the review count.
        self.story.setMetadata('reviews', stripHTML(sort_div('a')[1]))

        # Metadata is laid out as <span>Key:</span> value pairs inside the
        # listbox div; walk every span and dispatch on its stripped text.
        listbox_tag = soup.find('div', {'class': 'listbox'})
        for span_tag in listbox_tag('span'):
            key = span_tag.string
            if key:
                key = key.strip(' :')
            try:
                value = stripHTML(span_tag.nextSibling)
            # This can happen with some fancy markup in the summary. Just
            # ignore this error and set value to None, the summary parsing
            # takes care of this
            except AttributeError:
                value = None

            if key == 'Summary':
                contents = []
                keep_summary_html = self.getConfig('keep_summary_html')

                # Collect everything up to the next metadata label span.
                for sibling in _yield_next_siblings(span_tag):
                    if isinstance(sibling, Tag):
                        # Encountered next label, break. Not as bad as other
                        # e-fiction sites, let's hope this is enough for proper
                        # parsing.
                        # NOTE(review): with bs4 multi-valued attributes,
                        # get('class') returns a list, which never equals the
                        # string 'label' -- confirm how make_soup configures
                        # the parser before relying on this comparison.
                        if sibling.name == 'span' and sibling.get('class', None) == 'label':
                            break

                        if keep_summary_html:
                            contents.append(self.utf8FromSoup(self.url, sibling))
                        else:
                            contents.append(''.join(sibling(text=True)))
                    else:
                        contents.append(sibling)

                # Remove the preceding break line tag and other crud
                if contents:
                    contents.pop()
                if contents:
                    contents.pop()
                self.story.setMetadata('description', ''.join(contents))

            elif key == 'Rated':
                self.story.setMetadata('rating', value)

            elif key == 'Categories':
                # Values are anchors up to the next <br>.
                for sibling in span_tag.findNextSiblings(['a', 'br']):
                    if sibling.name == 'br':
                        break

                    self.story.addToList('category', stripHTML(sibling))

            # Seems to be always "None" for some reason
            elif key == 'Characters':
                for sibling in span_tag.findNextSiblings(['a', 'br']):
                    if sibling.name == 'br':
                        break
                    self.story.addToList('characters', stripHTML(sibling))

            elif key == 'Genres':
                for sibling in span_tag.findNextSiblings(['a', 'br']):
                    if sibling.name == 'br':
                        break

                    self.story.addToList('genre', stripHTML(sibling))

            elif key == 'Warnings':
                for sibling in span_tag.findNextSiblings(['a', 'br']):
                    if sibling.name == 'br':
                        break
                    self.story.addToList('warnings', stripHTML(sibling))

            # Challenges

            elif key == 'Series':
                a = span_tag.findNextSibling('a')
                if not a:
                    continue
                self.story.setMetadata('series', stripHTML(a))
                self.story.setMetadata('seriesUrl', urlparse.urljoin(self.BASE_URL, a['href']))

            elif key == 'Chapters':
                self.story.setMetadata('numChapters', int(value))

            elif key == 'Completed':
                self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

            elif key == 'Word count':
                self.story.setMetadata('numWords', value)

            elif key == 'Published':
                self.story.setMetadata('datePublished', makeDate(value, self.DATETIME_FORMAT))

            elif key == 'Updated':
                self.story.setMetadata('dateUpdated', makeDate(value, self.DATETIME_FORMAT))

        # Chapter links follow the listbox as <p> tags wrapping
        # viewstory.php anchors.
        for p_tag in listbox_tag.findNextSiblings('p'):
            chapter_anchor = p_tag.find('a', href=lambda href: href and href.startswith('viewstory.php?sid='))
            if not chapter_anchor:
                continue

            title = stripHTML(chapter_anchor)
            url = urlparse.urljoin(self.BASE_URL, chapter_anchor['href'])
            self.chapterUrls.append((title, url))
コード例 #40
0
class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter):
    """eFiction adapter for www.potionsandsnitches.org."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'pns')
        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=')[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() +
                     '/fanfiction/viewstory.php?sid=' +
                     self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        return 'www.potionsandsnitches.org'

    @classmethod
    def getAcceptDomains(cls):
        return ['potionsandsnitches.org', 'potionsandsnitches.net']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://www.potionsandsnitches.org/fanfiction/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        return re.escape(
            "http://"
        ) + r"(www\.)?potionsandsnitches\.(net|org)/fanfiction/viewstory\.php\?sid=\d+$"

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate metadata + chapter list.

        Raises StoryDoesNotExist on 404 and AccessDenied for unvalidated
        stories.
        """

        # index=1 makes sure we see the story chapter index.
        url = self.url + '&index=1'
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(
                self.getSiteDomain() +
                " says: Access denied. This story has not been validated by the adminstrators of this site."
            )

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        ## Title
        a = soup.find('a',
                      href=re.compile(r'viewstory.php\?sid=' +
                                      self.story.getMetadata('storyId') + "$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata(
            'authorUrl', 'http://' + self.host + '/fanfiction/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        for chapter in soup.findAll(
                'a',
                href=re.compile(r'viewstory.php\?sid=' +
                                self.story.getMetadata('storyId') +
                                r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append(
                (stripHTML(chapter),
                 'http://' + self.host + '/fanfiction/' + chapter['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        def defaultGetattr(d, k):
            # Tolerant item lookup: NavigableStrings are not subscriptable
            # and Tags may lack the attribute -- fall back to "".
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span', {'class': 'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next div class='listbox'
                svalue = ""
                while 'listbox' not in defaultGetattr(value, 'class'):
                    svalue += unicode(value)
                    value = value.nextSibling
                self.setDescription(url, svalue)

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Read' in label:
                self.story.setMetadata('reads', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=categories'))
                # cats are Tags: take their .string text directly.  (The old
                # code extracted the strings into a list and then accessed
                # .string on those strings again, which is wrong.)
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    chartext = char.string
                    # Split the site's combined required pairing into the
                    # two individual characters.
                    if "Snape and Harry (required)" in chartext:
                        self.story.addToList('characters', "Snape")
                        self.story.addToList('characters', "Harry")
                    else:
                        self.story.addToList('characters', chartext)

            if 'Warning' in label:
                warnings = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class'))
                for warning in warnings:
                    self.story.addToList('warnings', stripHTML(warning))

            if 'Genre' in label:
                genres = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class'))
                for genre in genres:
                    self.story.addToList('genre', stripHTML(genre))

            if 'Takes Place' in label:
                takesplaces = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class'))
                for takesplace in takesplaces:
                    self.story.addToList('takesplaces', stripHTML(takesplace))

            if 'Snape flavour' in label:
                snapeflavours = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class'))
                for snapeflavour in snapeflavours:
                    self.story.addToList('snapeflavours',
                                         stripHTML(snapeflavour))

            if 'Tags' in label:
                sitetags = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class'))
                for sitetag in sitetags:
                    self.story.addToList('sitetags', stripHTML(sitetag))

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                # limit date values, there's some extra chars.
                self.story.setMetadata(
                    'datePublished', makeDate(stripHTML(value[:12]),
                                              "%d %b %Y"))

            if 'Updated' in label:
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value[:12]), "%d %b %Y"))

        try:
            # Find Series name from series URL.
            a = soup.find('a',
                          href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/fanfiction/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll(
                'a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            i = 1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid=' +
                                 self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl', series_url)
                    break
                i += 1

        except Exception:
            # I find it hard to care if the series parsing fails
            pass

        # Star rating: whole stars plus half-star images.
        divsort = soup.find('div', id='sort')
        stars = len(divsort.find_all('img', src='images/star.gif'))
        stars = stars + 0.5 * len(
            divsort.find_all('img', src='images/starhalf.gif'))
        self.story.setMetadata('stars', stars)

        a = divsort.find_all(
            'a',
            href=re.compile(r'reviews.php\?type=ST&(amp;)?item=' +
                            self.story.getMetadata('storyId') +
                            "$"))[1]  # second one.
        self.story.setMetadata('reviews', stripHTML(a))
コード例 #41
0
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
    """Adapter for www.mediaminer.org, which has used several URL layouts."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'mm')
        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(), url)
        urltitle = 'urltitle'
        cattitle = 'cattitle'
        if m:
            if m.group('id1'):
                self.story.setMetadata('storyId', m.group('id1'))
                urltitle = m.group('urltitle1')
            elif m.group('id2'):
                self.story.setMetadata('storyId', m.group('id2'))
                urltitle = m.group('urltitle2')
            elif m.group('id3'):
                self.story.setMetadata('storyId', m.group('id3'))
            elif m.group('id4'):
                self.story.setMetadata('storyId', m.group('id4'))
                cattitle = m.group('cattitle4')
                urltitle = m.group('urltitle4')
            elif m.group('id5'):
                self.story.setMetadata('storyId', m.group('id5'))
                cattitle = m.group('cattitle5')
                urltitle = m.group('urltitle5')
            else:
                # was a bare InvalidStoryURL, a NameError at runtime -- the
                # class lives in the exceptions module (see else branch below).
                raise exceptions.InvalidStoryURL(url, self.getSiteDomain(),
                                                 self.getSiteExampleURLs())

            # normalized story URL.
            self._setURL('http://' + self.getSiteDomain() + '/fanfic/s/' +
                         cattitle + '/' + urltitle + '/' +
                         self.story.getMetadata('storyId'))
        else:
            raise exceptions.InvalidStoryURL(url, self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y %H:%M"

    @staticmethod
    def getSiteDomain():
        return 'www.mediaminer.org'

    @classmethod
    def getSiteExampleURLs(cls):
        return ("http://" + cls.getSiteDomain() +
                "/fanfic/s/category-name/story-title/123456 http://" +
                cls.getSiteDomain() +
                "/fanfic/c/category-name/story-title/123456/987612")

    def getSiteURLPattern(self):
        ## old urls
        ## http://www.mediaminer.org/fanfic/view_st.php/76882
        ## new urls
        ## http://www.mediaminer.org/fanfic/s/ghosts-from-the-past/72
        ## http://www.mediaminer.org/fanfic/c/ghosts-from-the-past/chapter-2/72/174
        ## http://www.mediaminer.org/fanfic/s/robtech-final-missions/61553
        ## http://www.mediaminer.org/fanfic/c/robtech-final-missions/robotech-final-missions-oneshot/61553/189830
        ## even newer urls
        ## http://www.mediaminer.org/fanfic/s/gundam-wing-fan-fiction/the-preventer-operatives/171000
        ## http://www.mediaminer.org/fanfic/c/gundam-wing-fan-fiction/the-preventer-operatives/171000/608822
        return re.escape("http://"+self.getSiteDomain())+r"/fanfic/"+\
            r"((s/(?P<cattitle4>[^/]+)/(?P<urltitle4>[^/]+)/(?P<id4>\d+))|"+\
            r"((c/(?P<cattitle5>[^/]+)/(?P<urltitle5>[^/]+)/(?P<id5>\d+))/\d+)|"+\
            r"(s/(?P<urltitle1>[^/]+)/(?P<id1>\d+))|"+\
            r"((c/(?P<urltitle2>[^/]+)/[^/]+/(?P<id2>\d+))/\d+)|"+\
            r"(view_st\.php/(?P<id3>\d+)))"

    # Override stripURLParameters so the id parameter won't get stripped
    @classmethod
    def stripURLParameters(cls, url):
        return url

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and populate metadata + chapter list."""

        url = self.url
        logger.debug("URL: " + url)

        try:
            # w/o trailing / gets 'chapter list' page even for one-shots.
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                logger.error("404 on %s" % url)
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        ## title:
        ## <h1 id="post-title">A, A' Fan Fiction &#10095; Mmmmm</h1>
        titletext = stripHTML(soup.find("h1", {"id": "post-title"}))
        # Drop everything up to and including the '❯' separator + space.
        titletext = titletext[titletext.index(u'❯') + 2:]
        self.story.setMetadata('title', titletext)

        # [ A - All Readers ], strip '[ ' ' ]'
        ## Above title because we remove the smtxt font to get title.
        smtxt = soup.find("div", {"id": "post-rating"})
        if not smtxt:
            logger.error("can't find rating")
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            rating = smtxt.string[2:-2]
            self.story.setMetadata('rating', rating)

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"/user_info.php/\d+"))
        self.story.setMetadata('authorId', a['href'].split('/')[-1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters - one-shot now have chapter list, too.
        chap_p = soup.find('p', {'style': 'margin-left:10px;'})
        for atag in chap_p.find_all('a'):
            self.chapterUrls.append(
                (stripHTML(atag), 'http://' + self.host + atag['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # category
        # <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
        for a in soup.findAll('a', href=re.compile(r"^/fanfic/a/")):
            self.story.addToList('category', a.string)

        # genre
        # <a href="/fanfic/src.php/g/567">Ranma 1/2</a>
        for a in soup.findAll('a', href=re.compile(r"^/fanfic/src.php/g/")):
            self.story.addToList('genre', a.string)

        # All remaining metadata lives in one text blob.
        metastr = stripHTML(soup.find("div", {"class": "post-meta"}))

        # Latest Revision: February 07, 2015 15:21 PST
        m = re.match(
            r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d \d\d:\d\d)",
            metastr)
        if m:
            self.story.setMetadata('dateUpdated',
                                   makeDate(m.group(1), self.dateformat))
            # site doesn't give date published on index page.

        # Words: 123456
        m = re.match(r".*?\| Words: (\d+) \|", metastr)
        if m:
            self.story.setMetadata('numWords', m.group(1))

        # Summary: ....
        m = re.match(r".*?Summary: (.*)$", metastr)
        if m:
            self.setDescription(url, m.group(1))

        # completed
        m = re.match(r".*?Status: Completed.*?", metastr)
        if m:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        return
コード例 #42
0
            if 'Warnings' in label:
                warnings = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata(
                    'datePublished',
                    makeDate(stripHTML(value).split(' |')[0], self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a',
                          href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
コード例 #43
0
class HLFictionNetAdapter(BaseSiteAdapter):
    """eFiction adapter for hlfiction.net."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "******" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=')[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' +
                     self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'hlf')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'hlfiction.net'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://" + self.getSiteDomain() +
                         "/viewstory.php?sid=") + r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and populate metadata + chapter list."""

        url = self.url
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(self.getSiteDomain() + " says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        ## Title and author
        a = soup.find('div', {'id': 'pagetitle'})

        aut = a.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', aut['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + aut['href'])
        self.story.setMetadata('author', aut.string)
        aut.extract()

        # NOTE(review): assumes the div's remaining text ends with ' by '
        # (hence the trimmed 3 chars); confirm a.string is non-None on live
        # pages after the author anchor is extracted.
        self.story.setMetadata('title', stripHTML(a)[:(len(a.string) - 3)])

        # Find the chapters:
        chapters = soup.find('select')
        if chapters is not None:
            for chapter in chapters.findAll('option'):
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),
                                         'http://' + self.host +
                                         '/viewstory.php?sid=' +
                                         self.story.getMetadata('storyId') +
                                         '&chapter=' + chapter['value']))
        else:
            # One-shot: the story page itself is the only chapter.
            self.chapterUrls.append((self.story.getMetadata('title'), url))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        asoup = self.make_soup(self._fetchUrl(self.story.getMetadata('authorUrl')))

        # Find the listbox div on the author page that belongs to this story.
        # (Loop variable renamed from 'list', which shadowed the builtin.)
        for listbox_div in asoup.findAll('div', {'class': re.compile('listbox')}):
            a = listbox_div.find('a')
            if ('viewstory.php?sid=' + self.story.getMetadata('storyId')) in a['href']:
                break

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        def defaultGetattr(d, k):
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="classification">Rated:</span> NC-17<br /> etc
        labels = listbox_div.findAll('span', {'class': 'classification'})
        for labelspan in labels:
            label = labelspan.string
            value = labelspan.nextSibling

            if 'Summary' in label:
                ## Everything until the next span class='classification'
                svalue = ""
                while 'classification' not in defaultGetattr(value, 'class'):
                    svalue += unicode(value)
                    value = value.nextSibling
                self.setDescription(url, svalue)

            if 'Rated' in label:
                # Trim trailing ', ' (or similar two-char crud).
                self.story.setMetadata('rating', value[:len(value) - 2])

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a', href=re.compile(r'categories.php\?catid=\d+'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                for char in value.string.split(', '):
                    if 'None' not in char:
                        self.story.addToList('characters', char)

            if 'Genre' in label:
                for genre in value.string.split(', '):
                    if 'None' not in genre:
                        self.story.addToList('genre', genre)

            if 'Warnings' in label:
                for warning in value.string.split(', '):
                    if 'None' not in warning:
                        self.story.addToList('warnings', warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                # there's a stray [ at the end.
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = listbox_div.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i = 1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if ('viewstory.php?sid=' + self.story.getMetadata('storyId')) in a['href']:
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl', series_url)
                        break
                    i += 1

        except Exception:
            # I find it hard to care if the series parsing fails
            pass
コード例 #44
0
class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
    """
    Adapter for the eFiction archive at www.adastrafanfic.com.

    Normalized story URLs look like
    http://www.adastrafanfic.com/viewstory.php?sid=1234 and all story
    metadata is scraped from the story's '&index=1' index page.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'aaff')
        self.decode = ["Windows-1252",
                       "utf8"]  # 1252 is a superset of iso-8859-1.
        # Most sites that claim to be
        # iso-8859-1 (and some that claim to be
        # utf8) are really windows-1252.
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' +
                     self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        # Domain used both for URL validation and for building fetch URLs.
        return 'www.adastrafanfic.com'

    @classmethod
    def getSiteExampleURLs(cls):
        # Example URL shown to the user for this site.
        return "http://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        # Accepts only the bare story URL: no extra query parameters.
        return re.escape("http://" + self.getSiteDomain() +
                         "/viewstory.php?sid=") + r"\d+$"

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return True

    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story index page and populate title, author, chapter
        URLs and the labelled metadata (summary, rating, word count,
        categories, characters, genres, warnings, status, dates, series).

        Raises exceptions.StoryDoesNotExist on HTTP 404 and
        exceptions.AdultCheckRequired when the site returns its
        mature-content warning page instead of the story.
        """

        if self.is_adult or self.getConfig("is_adult"):
            # '&warning=5' acknowledges the site's adult-content warning.
            addurl = "&warning=5"
        else:
            addurl = ""

        url = self.url + '&index=1' + addurl
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Content is only suitable for mature adults. May contain explicit language and adult themes. Equivalent of NC-17." in data:
            raise exceptions.AdultCheckRequired(self.url)

        # problems with some stories, but only in calibre.  I suspect
        # issues with different SGML parsers in python.  This is a
        # nasty hack, but it works.
        data = data[data.index("<body"):]

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        ## Title
        a = soup.find('a',
                      href=re.compile(r'viewstory.php\?sid=' +
                                      self.story.getMetadata('storyId') + "$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl',
                               'http://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        for chapter in soup.findAll(
                'a',
                href=re.compile(r'viewstory.php\?sid=' +
                                self.story.getMetadata('storyId') +
                                "&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append(
                (stripHTML(chapter),
                 'http://' + self.host + '/' + chapter['href'] + addurl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        ## <meta name='description' content='&lt;p&gt;Description&lt;/p&gt; ...' >
        ## Summary, strangely, is in the content attr of a <meta name='description'> tag
        ## which is escaped HTML.  Unfortunately, we can't use it because they don't
        ## escape (') chars in the desc, breakin the tag.
        #meta_desc = soup.find('meta',{'name':'description'})
        #metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
        #self.story.setMetadata('description',stripHTML(metasoup))

        def defaultGetattr(d, k):
            # Like d[k], but returns '' instead of raising when k is absent
            # (used on mixed Tag/NavigableString nodes below).
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span', {'class': 'label'})
        for labelspan in labels:
            # The metadata value is the node immediately after the label span.
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ''
                while value and not defaultGetattr(value, 'class') == 'label':
                    svalue += unicode(value)
                    value = value.nextSibling
                # sometimes poorly formated desc (<p> w/o </p>) leads
                # to all labels being included.
                svalue = svalue[:svalue.find('<span class="label">')]
                self.setDescription(url, svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=categories'))
                catstext = [cat.string for cat in cats]
                for cat in catstext:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=characters'))
                charstext = [char.string for char in chars]
                for char in charstext:
                    self.story.addToList('characters', char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class&type_id=1'))
                genrestext = [genre.string for genre in genres]
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre', genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class&type_id=2'))
                warningstext = [warning.string for warning in warnings]
                self.warning = ', '.join(warningstext)
                for warning in warningstext:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished',
                                       makeDate(value.strip(), "%d %b %Y"))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated',
                                       makeDate(value.strip(), "%d %b %Y"))

        try:
            # Find Series name from series URL.
            a = soup.find('a',
                          href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll(
                'a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            i = 1
            # Series position is this story's 1-based index among the
            # story links on the series page.
            for a in storyas:
                if a['href'] == ('viewstory.php?sid=' +
                                 self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl', series_url)
                    break
                i += 1

        except:
            # I find it hard to care if the series parsing fails
            pass
コード例 #45
0
class WhoficComSiteAdapter(BaseSiteAdapter):
    """
    Adapter for www.whofic.com, a Doctor Who fanfiction archive.

    The chapter list comes from the story page's chapter <select>;
    nearly all other metadata has to be scraped from the author's page
    because the story pages don't carry it.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'whof')

    @staticmethod
    def getSiteDomain():
        # Domain used for URL validation and building fetch URLs.
        return 'www.whofic.com'

    @classmethod
    def getSiteExampleURLs(cls):
        # Example URL shown to the user for this site.
        return "https://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        # Accepts both http and https story URLs.
        return r"https?" + re.escape("://" + self.getSiteDomain() +
                                     "/viewstory.php?sid=") + "\d+$"

    def extractChapterUrlsAndMetadata(self):
        """
        Scrape title/author and the chapter list from chapter 1, then
        scrape remaining metadata (description, category, rating,
        warnings, genre, characters, dates, status, word count, series)
        from the author's page.

        Raises exceptions.StoryDoesNotExist on HTTP 404.
        """

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

        # fetch the first chapter.  From that we will:
        # - determine title, authorname, authorid
        # - get chapter list, if not one-shot.

        url = self.url + '&chapter=1'
        logger.debug("URL: " + url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        try:
            soup = self.make_soup(self._fetchUrl(url))
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # pull title(title) and author from the HTML title.
        # The HTML title has the shape "Site :: Story Title by Author".
        title = stripHTML(soup.find('title'))
        logger.debug('Title: %s' % title)
        title = title.split('::')[1].strip()
        self.story.setMetadata('title', title.split(' by ')[0].strip())
        self.story.setMetadata('author', title.split(' by ')[1].strip())

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl',
                               'https://' + self.host + '/' + a['href'])

        # Find the chapter selector
        select = soup.find('select', {'name': 'chapter'})

        if select is None:
            # no selector found, so it's a one-chapter story.
            self.chapterUrls.append((self.story.getMetadata('title'), url))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                url = self.url + "&chapter=%s" % o['value']
                # just in case there's tags, like <i> in chapter titles.
                title = "%s" % o
                title = re.sub(r'<[^>]+>', '', title)
                self.chapterUrls.append((title, url))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        ## Whofic.com puts none of the other meta data in the chapters
        ## or even the story chapter index page.  Need to scrape the
        ## author page to find it.

        # <table width="100%" bordercolor="#333399" border="0" cellspacing="0" cellpadding="2"><tr><td>
        # <b><a href="viewstory.php?sid=38220">Accompaniment 2</a></b> by <a href="viewuser.php?uid=12412">clandestinemiscreant</a>  [<a href="reviews.php?sid=38220">Reviews</a> - <a href="reviews.php?sid=38220">0</a>] <br>
        # This is a series of short stories written as an accompaniment to Season 2, Season 28 for us oldies, and each is unrelated except for that one factor. Each story is canon, in that it does not change established events at time of airing, based on things mentioned and/or implied and missing or deleted scenes that were not seen in the final aired episodes.<br>
        # <font size="-1"><b><a href="categories.php?catid=15">Tenth Doctor</a></b> - All Ages - None - Humor, Hurt/Comfort, Romance<br>
        # <i>Characters:</i> Rose Tyler<br>
        # <i>Series:</i> None<br>
        # <i>Published:</i> 2010.08.15 - <i>Updated:</i> 2010.08.16 - <i>Chapters:</i> 4 - <i>Completed:</i> Yes - <i>Word Count:</i> 4890 </font>
        # </td></tr></table>

        logger.debug("Author URL: " + self.story.getMetadata('authorUrl'))
        soup = self.make_soup(
            self._fetchUrl(self.story.getMetadata(
                'authorUrl')))  # normalize <br> tags to <br />
        # find this story in the list, parse it's metadata based on
        # lots of assumptions about the html, since there's little
        # tagging.
        # Found a story once that had the story URL in the desc for a
        # series on the same author's page.  Now using the reviews
        # link instead to find the appropriate metadata.
        a = soup.find('a',
                      href=re.compile(r'reviews.php\?sid=' +
                                      self.story.getMetadata('storyId')))
        metadata = a.findParent('td')
        metadatachunks = self.utf8FromSoup(
            None, metadata, allow_replace_br_with_p=False).split('<br/>')

        # Some stories have a <br/> inside the description, which
        # causes the number of metadatachunks to be 7 or 8 or 10 instead of 5.
        # so we have to process through the metadatachunks to get the description,
        # then the next metadata chunk [GComyn]

        # process metadata for this story.
        description = metadatachunks[1]
        # NOTE(review): 'idx' is only bound when a 'categories.php' chunk is
        # found; if none exists this raises NameError below -- confirm that
        # every author-page entry contains a categories link.
        for i, mdc in enumerate(metadatachunks):
            if i == 0 or i == 1:
                # 0 is the title section, and 1 is always the description,
                # which is already set, so skip them [GComyn]
                pass
            else:
                if not 'categories.php' in mdc:
                    description += ' // ' + mdc
                else:
                    idx = i
                    break
        moremeta = metadatachunks[idx]
        self.setDescription(url, description)

        moremeta = re.sub(r'<[^>]+>', '', moremeta)  # strip tags.

        moremetaparts = moremeta.split(' - ')

        # first part is category--whofic.com has categories
        # Doctor One-11, Torchwood, etc.  We're going to
        # prepend any with 'Doctor' or 'Era' (Multi-Era, Other
        # Era) as 'Doctor Who'.
        #
        # Also push each in as 'extra tags'.
        category = moremetaparts[0]
        if 'Doctor' in category or 'Era' in category:
            self.story.addToList('category', 'Doctor Who')

        for cat in category.split(', '):
            self.story.addToList('category', cat)

        # next in that line is age rating.
        self.story.setMetadata('rating', moremetaparts[1])

        # after that is a possible list fo specific warnings,
        # Explicit Violence, Swearing, etc
        if "None" not in moremetaparts[2]:
            for warn in moremetaparts[2].split(', '):
                self.story.addToList('warnings', warn)

        # then genre.  It's another comma list.  All together
        # in genre, plus each in extra tags.
        genre = moremetaparts[3]
        for g in genre.split(r', '):
            self.story.addToList('genre', g)

        # line 3 is characters.
        chars = metadatachunks[idx + 1]
        charsearch = "<i>Characters:</i>"
        if charsearch in chars:
            chars = chars[metadatachunks[idx + 1].index(charsearch) +
                          len(charsearch):]
            for c in chars.split(','):
                if c.strip() != u'None':
                    self.story.addToList('characters', c)

        # the next line is stuff with ' - ' separators *and* names--with tags.
        moremeta = metadatachunks[idx + 3]
        moremeta = re.sub(r'<[^>]+>', '', moremeta)  # strip tags.

        moremetaparts = moremeta.split(' - ')

        # Each part looks like "Published: 2010.08.15" etc.
        for part in moremetaparts:
            (name, value) = part.split(': ')
            name = name.strip()
            value = value.strip()
            if name == 'Published':
                self.story.setMetadata('datePublished',
                                       makeDate(value, '%Y.%m.%d'))
            if name == 'Updated':
                self.story.setMetadata('dateUpdated',
                                       makeDate(value, '%Y.%m.%d'))
            if name == 'Completed':
                if value == 'Yes':
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')
            if name == 'Word Count':
                self.story.setMetadata('numWords', value)

        # Find Series name from series URL.
        a = metadata.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
        if a != None:
            series_name = a.string
            series_url = 'https://' + self.host + '/' + a['href']
            try:
                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = self.make_soup(self._fetchUrl(series_url))
                storyas = seriessoup.findAll(
                    'a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
                i = 1
                for a in storyas:
                    if a['href'] == ('viewstory.php?sid=' +
                                     self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl', series_url)
                        break
                    i += 1
            except:
                # I find it hard to care if the series parsing fails
                # I've changed it a little to put the series name and url in even if the page is no longer available [GComyn]
                self.setSeries(series_name, 0)
                self.story.setMetadata('seriesUrl', series_url)
コード例 #46
0
class BuffyNFaithNetAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.setHeader()

        self.decode = ["Windows-1252",
                       "utf8"]  # 1252 is a superset of iso-8859-1.
        # Most sites that claim to be
        # iso-8859-1 (and some that claim to be
        # utf8) are really windows-1252.
        self.username = "******"  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(), url)
        if m:
            self.story.setMetadata('storyId', m.group('id'))

            # normalized story URL. gets rid of chapter if there, left with ch 1 URL on this site
            nurl = "http://" + self.getSiteDomain(
            ) + "/fanfictions/index.php?act=vie&id=" + self.story.getMetadata(
                'storyId')
            self._setURL(nurl)
            #argh, this mangles the ampersands I need on metadata['storyUrl']
            #will set it this way
            self.story.setMetadata('storyUrl', nurl, condremoveentities=False)
        else:
            raise exceptions.InvalidStoryURL(url, self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'bnfnet')

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'buffynfaith.net'

    @classmethod
    def stripURLParameters(cls, url):
        "Only needs to be overriden if URL contains more than one parameter"
        ## This adapter needs at least two parameters left on the URL, act and id
        return re.sub(r"(\?act=(vie|ovr)&id=\d+)&.*$", r"\1", url)

    def setHeader(self):
        "buffynfaith.net wants a Referer for images.  Used both above and below(after cookieproc added)"
        self.opener.addheaders.append(
            ('Referer', 'http://' + self.getSiteDomain() + '/'))

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://" + cls.getSiteDomain(
        ) + "/fanfictions/index.php?act=vie&id=1234 http://" + cls.getSiteDomain(
        ) + "/fanfictions/index.php?act=ovr&id=1234 http://" + cls.getSiteDomain(
        ) + "/fanfictions/index.php?act=vie&id=1234&ch=2"

    def getSiteURLPattern(self):
        #http://buffynfaith.net/fanfictions/index.php?act=vie&id=963
        #http://buffynfaith.net/fanfictions/index.php?act=vie&id=949
        #http://buffynfaith.net/fanfictions/index.php?act=vie&id=949&ch=2
        p = re.escape("http://"+self.getSiteDomain()+"/fanfictions/index.php?act=")+\
            r"(vie|ovr)&id=(?P<id>\d+)(&ch=(?P<ch>\d+))?$"
        return p

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return True

    def extractChapterUrlsAndMetadata(self):

        dateformat = "%d %B %Y"
        url = self.url
        logger.debug("URL: " + url)

        #set a cookie to get past adult check
        if self.is_adult or self.getConfig("is_adult"):
            cookie = cl.Cookie(version=0,
                               name='my_age',
                               value='yes',
                               port=None,
                               port_specified=False,
                               domain=self.getSiteDomain(),
                               domain_specified=False,
                               domain_initial_dot=False,
                               path='/',
                               path_specified=True,
                               secure=False,
                               expires=time.time() + 10000,
                               discard=False,
                               comment=None,
                               comment_url=None,
                               rest={'HttpOnly': None},
                               rfc2109=False)
            self.cookiejar.set_cookie(cookie)
            self.setHeader()

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        #print data

        if "ADULT CONTENT WARNING" in data:
            raise exceptions.AdultCheckRequired(self.url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        #stuff in <head>: description
        svalue = soup.head.find('meta', attrs={'name':
                                               'description'})['content']
        #self.story.setMetadata('description',svalue)
        self.setDescription(url, svalue)

        #useful stuff in rest of doc, all contained in this:
        doc = soup.body.find('div', id='my_wrapper')

        #first the site category (more of a genre to me, meh) and title, in this element:
        mt = doc.find('div', attrs={'class': 'maintitle'})
        self.story.addToList('genre', mt.findAll('a')[1].string)
        self.story.setMetadata(
            'title',
            mt.findAll('a')[1].nextSibling[len('&nbsp;&raquo;&nbsp;'):])
        del mt

        #the actual category, for me, is 'Buffy: The Vampire Slayer'
        #self.story.addToList('category','Buffy: The Vampire Slayer')
        #No need to do it here, it is better to set it in in plugin-defaults.ini and defaults.ini

        #then a block that sits in a table cell like so:
        #(contains a lot of metadata)
        mblock = doc.find('td', align='left', width='70%').contents
        while len(mblock) > 0:
            i = mblock.pop(0)
            if 'Author:' in i.string:
                #drop empty space
                mblock.pop(0)
                #get author link
                a = mblock.pop(0)
                authre = re.escape(
                    './index.php?act=bio&id=') + '(?P<authid>\d+)'
                m = re.match(authre, a['href'])
                self.story.setMetadata('author', a.string)
                self.story.setMetadata('authorId', m.group('authid'))
                authurl = u'http://%s/fanfictions/index.php?act=bio&id=%s' % (
                    self.getSiteDomain(), self.story.getMetadata('authorId'))
                self.story.setMetadata('authorUrl',
                                       authurl,
                                       condremoveentities=False)
                #drop empty space
                mblock.pop(0)
            if 'Rating:' in i.string:
                self.story.setMetadata('rating', mblock.pop(0).strip())
            if 'Published:' in i.string:
                date = mblock.pop(0).strip()
                #get rid of 'st', 'nd', 'rd', 'th' after day number
                date = date[0:2] + date[4:]
                self.story.setMetadata('datePublished',
                                       makeDate(date, dateformat))
            if 'Last Updated:' in i.string:
                date = mblock.pop(0).strip()
                #get rid of 'st', 'nd', 'rd', 'th' after day number
                date = date[0:2] + date[4:]
                self.story.setMetadata('dateUpdated',
                                       makeDate(date, dateformat))
            if 'Genre:' in i.string:
                genres = mblock.pop(0).strip()
                genres = genres.split('/')
                for genre in genres:
                    self.story.addToList('genre', genre)
            #end ifs
        #end while

        # Find the chapter selector
        select = soup.find('select', {'name': 'ch'})

        if select is None:
            # no selector found, so it's a one-chapter story.
            #self.chapterUrls.append((self.story.getMetadata('title'),url))
            self.chapterUrls.append((self.story.getMetadata('title'), url))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                url = u'http://%s/fanfictions/index.php?act=vie&id=%s&ch=%s' % (
                    self.getSiteDomain(), self.story.getMetadata('storyId'),
                    o['value'])
                title = u"%s" % o
                title = stripHTML(title)
                ts = title.split(' ', 1)
                title = ts[0] + '. ' + ts[1]
                self.chapterUrls.append((title, url))
        self.story.setMetadata('numChapters', len(self.chapterUrls))

        ## Go scrape the rest of the metadata from the author's page.
        data = self._fetchUrl(self.story.getMetadata('authorUrl'))
        soup = bs.BeautifulSoup(data)
        #find the story link and its parent div
        storya = soup.find('a', {'href': self.story.getMetadata('storyUrl')})
        storydiv = storya.parent
        #warnings come under a <spawn> tag. Never seen that before...
        #appears to just be a line of freeform text, not necessarily a list
        #optional
        spawn = storydiv.find('spawn', {'id': 'warnings'})
        if spawn is not None:
            warns = spawn.nextSibling.strip()
            self.story.addToList('warnings', warns)
        #some meta in spans - this should get all, even the ones jammed in a table
        spans = storydiv.findAll('span')
        for s in spans:
            if s.string == 'Ship:':
                list = s.nextSibling.strip().split()
                self.story.extendList('ships', list)
            if s.string == 'Characters:':
                list = s.nextSibling.strip().split(',')
                self.story.extendList('characters', list)
            if s.string == 'Status:':
                st = s.nextSibling.strip()
                self.story.setMetadata('status', st)
            if s.string == 'Words:':
                st = s.nextSibling.strip()
                self.story.setMetadata('numWords', st)

        #reviews - is this worth having?
        #ffnet adapter gathers it, don't know if anything else does
        #or if it's ever going to be used!
        a = storydiv.find('a', {'id': 'bold-blue'})
        if a:
            revs = a.nextSibling.strip()[1:-1]
            self.story.setMetadata('reviews', st)
        else:
            revs = '0'
            self.story.setMetadata('reviews', st)
コード例 #47
0
    def extractChapterUrlsAndMetadata(self):
        '''
        Getting the chapter list and the meta data, plus 'is adult' checking.

        This site ("GaggedUtopia's Story Archive") is one story per
        page, so the chapter list is just the entered URL and status is
        always 'Complete'.  Metadata comes from the page's <li> detail
        lines; the category has to be fetched from the author's page.
        The parsed soup is kept on self.html so getChapterText doesn't
        have to re-fetch the page.
        '''

        ## This is an adult site, so if they have not set their is_adult in the personal.ini, it will
        ## fail
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(
                'This is an adult site. You need to be an adult to download from here.'
            )

        url = self.url
        logger.debug("URL: " + url)

        data = self.get_page(url)

        if "Latest Stories" in data:
            # Site serves its front page instead of a 404 for bad story ids.
            raise exceptions.StoryDoesNotExist(
                "The url '{0}' is not on site '{1}'".format(
                    url, self.getSiteDomain()))
        elif "The author as requested this story be removed from publication." in data:
            raise exceptions.StoryDoesNotExist(
                "{0} says: The author as requested this story be removed from publication."
                .format(self.getSiteDomain()))

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = unicode(soup.find('title')).replace(
            ":: GaggedUtopia's Story Archive", '').strip()
        self.story.setMetadata('title', stripHTML(a))

        # Find the chapters:
        ## This site is a 1 story/page site, so I'm setting the chapterUrls to the entered url and
        # the status to complete
        self.chapterUrls.append(('', url))
        self.story.setMetadata('numChapters', 1)
        self.story.setMetadata('status', 'Complete')

        # Detail lines look like "Heading - text"; the bytes
        # "\xc2\xa0" are a UTF-8 non-breaking space (py2 str == bytes).
        for detail in soup.findAll('li'):
            det = str(detail).replace(b"\xc2\xa0", '')
            heading = stripHTML(det).split(' - ')[0]
            text = stripHTML(det).replace(heading + ' - ', '')
            if 'Author' in heading:
                a = detail.find('a')
                if 'mailto' in str(a):
                    # mailto-only author: no author page to link or scrape.
                    self.story.setMetadata('authorId', '0000000000')
                    self.story.setMetadata('authorUrl', self.url)
                    self.story.setMetadata('author', 'Unknown')
                    self.story.setMetadata('category', 'Unknown')
                else:
                    self.story.setMetadata('authorId', a['href'].split('/')[2])
                    self.story.setMetadata('author', a.string)
                    self.story.setMetadata(
                        'authorUrl', 'http://' + self.host +
                        urllib2.quote(a['href'].encode('UTF-8')))
            elif 'Story Codes' in heading:
                self.story.setMetadata('eroticatags',
                                       text.replace('Story Codes - ', ''))
            elif 'Post Date' in heading:
                self.story.setMetadata('datePublished',
                                       makeDate(text, self.dateformat))
            elif 'Rating' in heading:
                ## this is a numerical rating for the story.
                pass
            elif 'Site Rank' in heading:
                ## This is a numerical value that shows where in the list of stories
                ## the current story is ranked
                pass
            elif 'Unique Views' in heading:
                ## This is the number of times the story has bee viewed.
                pass
            elif 'PDF Download' in heading:
                ## This is a link to download the PDF.
                pass

        ## The only way to get the category is from the author's page, but if there is no author to
        ## get, we can't set it.
        if self.story.getMetadata('author') != 'Unknown':
            adata = self.get_page(self.story.getMetadata('authorUrl'))
            asoup = self.make_soup(adata)
            storyblock = asoup.find(
                'a',
                href=re.compile(r"/code/show_story.asp/recid/" +
                                self.story.getMetadata('storyId')))
            if storyblock != None:
                # Category is the text of the <td> following this story's link.
                td = storyblock.findNext('td')
                self.story.setMetadata('category', td.string)

        # since the 'story' is one page, I am going to save the soup here, so we can use iter
        # to get the story text in the getChapterText function, instead of having to retrieve
        # it again.
        self.html = soup
コード例 #48
0
                ## (fragment) label/value loop of an eFiction adapter; the
                ## loop header, 'label', 'value', 'labelspan' and 'genres'
                ## are defined before this chunk.
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                # Warnings are the anchors pointing at the class-type-2
                # browse page.  The XXX marker is original -- the type_id
                # value varies per eFiction install; verify for this site.
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series handling: follow the series link (if any) to number this
        # story within its series.  (The except clause closing this try is
        # past the end of this chunk.)
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/missingpieces/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
コード例 #49
0
    def extractChapterUrlsAndMetadata(self):
        """Get the chapter list and the metadata, plus 'is adult' checking.

        This is a one-story-per-page site: the single "chapter" is the
        story URL itself, and the parsed soup is cached on ``self.html``
        so getChapterText() doesn't have to re-fetch the page.

        Raises AdultCheckRequired unless is_adult is set, and
        StoryDoesNotExist when the site bounces to its front page or the
        story was withdrawn.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(
                'This is an adult site. You need to be an adult to download from here.')

        url = self.url
        logger.debug("URL: "+url)

        data = self.get_page(url)

        ## "Latest Stories" is the site's front page, which is what you get
        ## (instead of a 404) for a nonexistent story id.
        if "Latest Stories" in data:
            raise exceptions.StoryDoesNotExist("The url '{0}' is not on site '{1}'".format(
                url, self.getSiteDomain()))
        ## (sic) "as requested" -- this is the site's own wording, matched
        ## and quoted verbatim; do not "fix" the grammar.
        elif "The author as requested this story be removed from publication." in data:
            raise exceptions.StoryDoesNotExist(
                "{0} says: The author as requested this story be removed from publication.".format(
                    self.getSiteDomain()))

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title: the <title> tag minus the site's suffix.
        a = unicode(soup.find('title')).replace(":: GaggedUtopia's Story Archive",'').strip()
        self.story.setMetadata('title',stripHTML(a))

        # Find the chapters:
        ## This site is a 1 story/page site, so I'm setting the chapterUrls to the entered url and
        # the status to complete
        self.chapterUrls.append(('',url))
        self.story.setMetadata('numChapters',1)
        self.story.setMetadata('status', 'Completed')

        ## Each <li> holds one "Heading - value" metadata pair.
        for detail in soup.findAll('li'):
            # Drop UTF-8 non-breaking spaces before text parsing.
            # NOTE(review): str/bytes replace only works on Python 2.
            det = str(detail).replace(b"\xc2\xa0",'')
            heading = stripHTML(det).split(' - ')[0]
            text = stripHTML(det).replace(heading+' - ','')
            if 'Author' in heading:
                a = detail.find('a')
                if 'mailto' in str(a):
                    # Email-only author: no author page, use placeholders.
                    self.story.setMetadata('authorId','0000000000')
                    self.story.setMetadata('authorUrl',self.url)
                    self.story.setMetadata('author','Unknown')
                    self.story.setMetadata('category','Unknown')
                else:
                    # authorId is the third path segment of the author link.
                    self.story.setMetadata('authorId',a['href'].split('/')[2])
                    self.story.setMetadata('author',a.string)
                    self.story.setMetadata('authorUrl','http://'+self.host+urllib2.quote(
                        a['href'].encode('UTF-8')))
            elif 'Story Codes' in heading:
                self.story.setMetadata('eroticatags',text.replace('Story Codes - ',''))
            elif 'Post Date' in heading:
                self.story.setMetadata('datePublished', makeDate(text, self.dateformat))
            elif 'Rating' in heading:
                ## this is a numerical rating for the story.
                pass
            elif 'Site Rank' in heading:
                ## This is a numerical value that shows where in the list of stories
                ## the current story is ranked
                pass
            elif 'Unique Views' in heading:
                ## This is the number of times the story has been viewed.
                pass
            elif 'PDF Download' in heading:
                ## This is a link to download the PDF.
                pass

        ## The only way to get the category is from the author's page, but if there is no author to
        ## get, we can't set it.
        if self.story.getMetadata('author') != 'Unknown':
            adata = self.get_page(self.story.getMetadata('authorUrl'))
            asoup = self.make_soup(adata)
            storyblock = asoup.find('a',href=re.compile(r"/code/show_story.asp/recid/"+
                self.story.getMetadata('storyId')))
            if storyblock is not None:
                # Category is the text of the cell following the story link.
                td = storyblock.findNext('td')
                self.story.setMetadata('category',td.string)

        # since the 'story' is one page, I am going to save the soup here, so we can use iter
        # to get the story text in the getChapterText function, instead of having to retrieve
        # it again.
        self.html = soup
コード例 #50
0
class FictionHuntComSiteAdapter(BaseSiteAdapter):
    """Adapter for fictionhunt.com, a mirror of fanfiction.net stories.

    The site has no descriptions or author pages of its own; the author
    metadata is taken from the embedded fanfiction.net links and the
    description is a fixed placeholder.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'fichunt')

        # get storyId from url--url validation guarantees second part is storyId
        self.story.setMetadata('storyId', self.parsedUrl.path.split('/', )[2])

        # normalized story URL always points at page 1.
        self._setURL("http://"+self.getSiteDomain()\
                         +"/read/"+self.story.getMetadata('storyId')+"/1")

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d-%m-%Y"

    @staticmethod
    def getSiteDomain():
        return 'fictionhunt.com'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://fictionhunt.com/read/1234/1"

    def getSiteURLPattern(self):
        # Dots escaped -- unescaped '.' would match any character.
        return r"http://(www\.)?fictionhunt\.com/read/\d+(/\d+)?(/|/[^/]+)?/?$"

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return True

    def doExtractChapterUrlsAndMetadata(self, get_cover=True):
        """Fetch the first chapter page and scrape title, author, the
        original ffnet links, the details line and the chapter list."""

        url = self.url
        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:  # 'as' form works on py2.6+ and py3
            if e.code == 404:
                # was 'self.meta', which is never defined on this adapter
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise  # bare raise preserves the original traceback

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        self.story.setMetadata(
            'title',
            stripHTML(soup.find('div', {'class': 'title'})).strip())

        self.setDescription(
            url,
            '<i>(Story descriptions not available on fictionhunt.com)</i>')

        # Find authorid and URL from... author url.
        # fictionhunt doesn't have author pages, use ffnet original author link.
        a = soup.find('a', href=re.compile(r"fanfiction.net/u/\d+"))
        self.story.setMetadata('authorId', a['href'].split('/')[-1])
        self.story.setMetadata(
            'authorUrl', 'https://www.fanfiction.net/u/' +
            self.story.getMetadata('authorId'))
        self.story.setMetadata('author', a.string)

        # Find original ffnet URL
        a = soup.find('a', href=re.compile(r"fanfiction.net/s/\d+"))
        self.story.setMetadata('origin', stripHTML(a))
        self.story.setMetadata('originUrl', a['href'])

        # Fleur D. & Harry P. & Hermione G. & Susan B. - Words: 42,848 - Rated: M - English - None - Chapters: 9 - Reviews: 248 - Updated: 21-09-2016 - Published: 16-05-2015 - by Elven Sorcerer (FFN)
        # None - Words: 13,087 - Rated: M - English - Romance & Supernatural - Chapters: 3 - Reviews: 5 - Updated: 21-09-2016 - Published: 20-09-2016
        # Harry P. & OC - Words: 10,910 - Rated: M - English - None - Chapters: 5 - Reviews: 6 - Updated: 21-09-2016 - Published: 11-09-2016
        # Dudley D. & Harry P. & Nagini & Vernon D. - Words: 4,328 - Rated: K+ - English - None - Chapters: 2 - Updated: 21-09-2016 - Published: 20-09-2016 -
        details = soup.find('div', {'class': 'details'})

        # NOTE: rating class is '[a-zA-Z+]' -- the previous '[a-zA-Z\\+]'
        # put a literal backslash into the class (raw string), which was
        # never intended; ratings are letters plus '+' only.
        detail_re = \
            r'(?P<characters>.+) - Words: (?P<numWords>[0-9,]+) - Rated: (?P<rating>[a-zA-Z+]+) - (?P<language>.+) - (?P<genre>.+)'+ \
            r' - Chapters: (?P<numChapters>[0-9,]+)( - Reviews: (?P<reviews>[0-9,]+))? - Updated: (?P<dateUpdated>[0-9-]+)'+ \
            r' - Published: (?P<datePublished>[0-9-]+)(?P<completed> - Complete)?'

        details_dict = re.match(detail_re, stripHTML(details)).groupdict()

        # lists -- the literal string 'None' is the site's placeholder.
        for meta in ('characters', 'genre'):
            if details_dict[meta] != 'None':
                self.story.extendList(meta, details_dict[meta].split(' & '))

        # scalars -- 'reviews' is an optional group; skip it when absent
        # rather than storing None.
        for meta in ('numWords', 'numChapters', 'rating', 'language',
                     'reviews'):
            if details_dict[meta] is not None:
                self.story.setMetadata(meta, details_dict[meta])

        # dates
        for meta in ('datePublished', 'dateUpdated'):
            self.story.setMetadata(
                meta, makeDate(details_dict[meta], self.dateformat))

        # status
        if details_dict['completed']:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # It's assumed that the number of chapters is correct.
        # There's no complete list of chapters, so the only
        # alternative is to get the num of chaps from the last
        # indicated chapter list instead.
        for i in range(1, 1 + int(self.story.getMetadata('numChapters'))):
            self.chapterUrls.append(("Chapter "+unicode(i),"http://"+self.getSiteDomain()\
                                         +"/read/"+self.story.getMetadata('storyId')+"/%s"%i))
コード例 #51
0
    def extractChapterUrlsAndMetadata(self):
        """Collect chapter URLs and story metadata for this Hungarian
        eFiction-based archive.

        Chapter URLs come from the story page's chapter <select> options;
        nearly all other metadata is scraped from the story's block on the
        author page.  Python 2 style: values are compared as byte strings
        encoded with _SOURCE_CODE_ENCODING.
        """
        soup = self._customized_fetch_url(self.url + '&i=1')

        # A title reducing to 'írta' ("written by") marks the archive's
        # invalid-story page.  NOTE(review): inferred from this check only;
        # confirm against the live site.
        if soup.title.string.encode(_SOURCE_CODE_ENCODING).strip(' :') == 'írta':
            raise exceptions.StoryDoesNotExist(self.url)

        chapter_options = soup.find('form', action='viewstory.php').select('option')
        # Remove redundant "Fejezetek" option
        chapter_options.pop(0)

        # If there is still more than one entry remove chapter overview entry
        if len(chapter_options) > 1:
            chapter_options.pop(0)

        for option in chapter_options:
            url = urlparse.urljoin(self.url, option['value'])
            self.chapterUrls.append((option.string, url))

        # The author page lists every story with its full metadata block.
        author_url = urlparse.urljoin(self.BASE_URL, soup.find('a', href=lambda href: href and href.startswith('viewuser.php?uid='))['href'])
        soup = self._customized_fetch_url(author_url)

        # Find this story's table on the author page by matching the sid in
        # the title link (the href may be wrapped in a javascript: call).
        story_id = self.story.getMetadata('storyId')
        for table in soup('table', {'class': 'mainnav'}):
            title_anchor = table.find('span', {'class': 'storytitle'}).a
            href = title_anchor['href']
            if href.startswith('javascript:'):
                href = href.rsplit(' ', 1)[1].strip("'")
            query_data = _get_query_data(href)

            if query_data['sid'] == story_id:
                break
        else:
            # This should never happen, the story must be found on the author's
            # page.
            raise exceptions.FailedToDownload(self.url)

        self.story.setMetadata('title', title_anchor.string)

        rows = table('tr')

        # First row: anchors include the author link and the reviews link.
        anchors = rows[0].div('a')
        author_anchor = anchors[1]
        query_data = _get_query_data(author_anchor['href'])
        self.story.setMetadata('author', author_anchor.string)
        self.story.setMetadata('authorId', query_data['uid'])
        self.story.setMetadata('authorUrl', urlparse.urljoin(self.BASE_URL, author_anchor['href']))
        self.story.setMetadata('reviews', anchors[3].string)

        # Second row: the summary cell.
        if self.getConfig('keep_summary_html'):
            self.story.setMetadata('description', self.utf8FromSoup(author_url, rows[1].td))
        else:
            self.story.setMetadata('description', ''.join(rows[1].td(text=True)))

        # Remaining rows hold label/value cell pairs (Hungarian labels);
        # walk them two cells at a time.
        for row in rows[3:]:
            index = 0
            cells = row('td')

            while index < len(cells):
                cell = cells[index]
                key = cell.b.string.encode(_SOURCE_CODE_ENCODING).strip(':')
                try:
                    value = cells[index+1].string.encode(_SOURCE_CODE_ENCODING)
                except AttributeError:
                    # Value cell contains markup rather than a plain string.
                    value = None

                # "Category"
                if key == 'Kategória':
                    for anchor in cells[index+1]('a'):
                        self.story.addToList('category', anchor.string)

                # "Characters"
                elif key == 'Szereplõk':
                    if cells[index+1].string:
                        for name in cells[index+1].string.split(', '):
                            self.story.addToList('character', name)

                # "Age rating"; 'nem korhatáros' = not age-restricted.
                elif key == 'Korhatár':
                    if value != 'nem korhatáros':
                        self.story.setMetadata('rating', value)

                # "Warnings"
                elif key == 'Figyelmeztetések':
                    for b_tag in cells[index+1]('b'):
                        self.story.addToList('warnings', b_tag.string)

                # "Characteristics" (genres)
                elif key == 'Jellemzõk':
                    for genre in cells[index+1].string.split(', '):
                        self.story.addToList('genre', genre)

                # "Chapters"
                elif key == 'Fejezetek':
                    self.story.setMetadata('numChapters', int(value))

                # "Published"
                elif key == 'Megjelenés':
                    self.story.setMetadata('datePublished', makeDate(value, self.DATE_FORMAT))

                # "Updated"
                elif key == 'Frissítés':
                    self.story.setMetadata('dateUpdated', makeDate(value, self.DATE_FORMAT))

                # "Word count"
                elif key == 'Szavak':
                    self.story.setMetadata('numWords', value)

                # "Completed" -- NOTE(review): 'Nem' means "no" in
                # Hungarian, yet it maps to 'Completed' here; this looks
                # inverted.  Confirm the site's field semantics before
                # changing it.
                elif key == 'Befejezett':
                    self.story.setMetadata('status', 'Completed' if value == 'Nem' else 'In-Progress')

                index += 2

        # Adult check is done last, once the rating is known.
        if self.story.getMetadata('rating') == '18':
            if not (self.is_adult or self.getConfig('is_adult')):
                raise exceptions.AdultCheckRequired(self.url)
コード例 #52
0
    def extractChapterUrlsAndMetadata(self):
        """Scrape metadata for a one-story-per-page archive.

        The story page itself only yields the title and an author link;
        the remaining metadata is pulled from the site's search-engine
        results for that author.
        """

        url = self.url
        logger.debug("URL: " + url)

        data = self.get_page(url)

        # Since this is a site with the entire story on one page and there are no updates, I'm going
        # to set the status to complete.
        self.story.setMetadata('status', 'Completed')

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Title
        ## Some stories do not have the title in a tag that can be easily gotten.
        title = soup.find('h2')
        if not title:
            raise exceptions.StoryDoesNotExist(
                'Cannot find title on the page {}'.format(url))

        self.story.setMetadata('title', stripHTML(title))

        # This site has the entire story on one page, so we will be using the normalized URL as
        # the chapterUrl and the Title as the chapter Title
        self.add_chapter(self.story.getMetadata('title'), url)

        ## i would take this out, as it is not really needed, but the calibre plugin uses it,
        ## so it's staying
        self.story.setMetadata('numChapters', 1)

        # Find authorid and URL
        ## this site does not have dedicated pages for the authors, you have to use the searh
        ## engine. so that is what I will do.
        # NOTE(review): because of the elif, the '@' suffix is only stripped
        # when the author came from the first lookup; a mailto fallback that
        # contains '@' keeps it.  Confirm this is intended.
        mdata = stripHTML(soup.find('h2').find_next('a'))
        if not mdata:
            mdata = stripHTML(soup.find('a', href=re.compile('mailto')))
        elif '@' in mdata:
            mdata = mdata.split('@')[0]
        self.story.setMetadata('authorId', mdata)
        self.story.setMetadata('author', mdata.title())

        # Some stories list multiple authors, but the search engine only uses 1 author, and since
        # we can't tell how many 'words' are in each name, I'm going to do a work around.
        author_name = mdata.split('  ')[0].strip()
        author_url = ('http://' + self.getSiteDomain() +
                      '/cgi-bin/search.cgi?Author={}&SortBy=0' +
                      '&SortOrder=0&NumToList=0&FastSearch=0&ShortResults=0'
                      ).format(author_name)
        # Retry the search, dropping the last word of the author name each
        # time, until the story title shows up in the results.
        # NOTE(review): if the search page yields no 'content' table cell,
        # neither branch below runs and this loops forever re-fetching the
        # same URL; author_name can also shrink to ''.  Consider a retry cap.
        story_found = False
        while not story_found:
            asoup = self.make_soup(self.get_page(author_url))
            # Ok...this site does not have the stories encompassed by any sort of tag... so I have
            # to make it.
            stories_main = asoup.find('table', {'class': 'content'}).find('td')
            if stories_main:
                if len(repr(stories_main).split('<b>', 1)) == 1:
                    # No <b> markers -> no results; retry with a shorter name.
                    author_name = ' '.join(author_name.split()[:-1])
                    author_url = ('http://' + self.getSiteDomain() +
                                  '/cgi-bin/search.cgi?Author={}&SortBy=0' +
                                  '&SortOrder=0&NumToList=0&FastSearch=0' +
                                  '&ShortResults=0').format(author_name)
                    pass
                else:
                    stories_main = u'<b>' + repr(stories_main).split('<b>',
                                                                     1)[1][:-5]
                    ## now that I have the stories in a format that I can manipulate, I'm going to
                    # split them up. The last 2 elements are not stories, so I a going to drop them.
                    stories = stories_main.replace('\\n', '').split('<p>')[:-2]
                    for story in stories:
                        ## now I am going to turn this string back into a bs tag, removing the <b>
                        # tags for easier manipulation
                        story = '<div>' + story.replace('<b>', '').replace(
                            '</b>', '') + '</div>'
                        story = self.make_soup(story).find('div')
                        story_a = story.find('a')
                        ## some stories have special characters... need to fix them.
                        title = repr(
                            self.story.getMetadata('title'))[2:-1].replace(
                                '&amp;', '&')
                        if title in story_a.get_text():
                            story_found = True
                            break
                    if not story_found:
                        raise exceptions.StoryDoesNotExist(
                            "Could not find the story {} on the author's {} search page {}"
                            .format(url, author_name, author_url))

        self.story.setMetadata('authorUrl', author_url)

        # 'story' here is the loop variable left over from the search loop
        # above -- the matched listing block.
        # The first element is the author, which we already have, so I'm going to drop it.
        mdatas = story.find_all('br')[1:]
        for mdata in mdatas:
            meta = mdata.nextSibling.string

            if meta:
                # This site doesn't seem to have any url links within the story listing (except for
                # the author and title, which we've already gotten), so I don't have to worry about
                # that.
                label = meta.split(':', 2)[0].strip().lower()
                value = meta[len(label) + 1:].strip()
                if label == 'show':
                    # This site uses the show label for the category (as used on ffnet)
                    self.story.setMetadata('category', value)
                elif label == 'rating':
                    self.story.setMetadata('rating', value)
                elif label == 'category':
                    # This site uses the category for the genre (as used on ffnet)
                    self.story.setMetadata('genre', value)
                elif label == 'characters':
                    self.story.setMetadata('characters', value)
                elif label == 'pairings':
                    self.story.setMetadata('ships', value)
                elif label == 'summary':
                    self.setDescription(url, value)
                elif label == 'warnings':
                    self.story.setMetadata('warnings', value)
                elif label == 'archived on':
                    self.story.setMetadata('datePublished',
                                           makeDate(value, self.dateformat))
                else:
                    #There shouldn't be any other labels, but I'm putting this here to catch
                    # anything that might be missed
                    logger.debug('Missed metadata: %s' % meta)
            else:
                # there should always be something, but just in case, I'm going to print it out in
                # the debugger
                logger.debug('Missed metadata: %s' % mdata)

        # since this is the only "chapter" that will be retrieved, I'm going to save the soup here
        # so the getChapterText function doesn't have to use more bandwidth to get it again
        self.html = soup
コード例 #53
0
            ## (fragment) tail of a regex-based metadata parser; 'metastr'
            ## and the enclosing 'if m:' are before this chunk.
            self.story.setMetadata('rating', m.group(1))

        # Genres: comma-separated list.
        m = re.match(r".*?Genres: (.+?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('genre',g)

        # Characters: comma-separated; (.*?) may match empty, hence the
        # truthiness check on each name.
        m = re.match(r".*?Characters: (.*?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                if g:
                    self.story.addToList('characters',g)

        m = re.match(r".*?Published: ([0-9-]+?) -.*?",metastr)
        if m:
            self.story.setMetadata('datePublished',makeDate(m.group(1), "%Y-%m-%d"))

        # Updated can have more than one space after it. <shrug>
        m = re.match(r".*?Updated: ([0-9-]+?) +-.*?",metastr)
        if m:
            self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%Y-%m-%d"))

        m = re.match(r".*? - ([0-9,]+?) words.*?",metastr)
        if m:
            self.story.setMetadata('numWords',m.group(1))

        # Trailing "Complete" marks finished stories.
        if metastr.endswith("Complete"):
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')
コード例 #54
0
            ## (fragment) label/value loop of an eFiction adapter; the loop
            ## header and 'label'/'value'/'labelspan' are before this chunk.
            if 'Warnings' in label:
                # Warnings are the anchors pointing at the class-type-3
                # browse page on this install.
                warnings = labelspan.parent.findAll(
                    'a', href=re.compile(r'browse.php\?type=class&type_id=3'))
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata(
                    'datePublished', makeDate(stripHTML(value),
                                              self.dateformat))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series handling: follow the series link (if any).  The except
        # clause closing this try is past the end of this chunk.
        try:
            # Find Series name from series URL.
            a = soup.find(
                'a',
                href=re.compile(r"fanfiction/viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://' + self.host + '/' + a['href']
コード例 #55
0
            ## (fragment) label/value loop of an eFiction adapter; the loop
            ## header and 'label'/'value'/'labelspan' are before this chunk.
            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=11'))
                # Joined copy kept on the adapter as well as the list field.
                warningstext = [warning.string for warning in warnings]
                self.warning = ', '.join(warningstext)
                # NOTE(review): items of warningstext are already the
                # anchors' .string values; taking '.string' of them again
                # looks wrong (NavigableString has no meaningful .string on
                # bs4).  Confirm before relying on this.
                for warning in warningstext:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(value.strip(), "%d %b %Y"))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%d %b %Y"))

        # Series handling; the except clause closing this try -- and the
        # body of the final for loop -- are past the end of this chunk.
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'https://'+self.host+'/efiction/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            i=1
            for a in storyas:
コード例 #56
0
class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
    """Site adapter for archiveofourown.org (AO3)."""

    def __init__(self, config, url):
        """Validate *url*, extract the story ID, and normalize the story URL.

        Raises exceptions.InvalidStoryURL when *url* does not match
        getSiteURLPattern().
        """
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "******"  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        self.full_work_soup = None

        # get storyId from url--url validation guarantees query correct.
        # (An earlier version also pulled the id from parsedUrl.path index 2,
        # which was wrong for /collections/<name>/works/<id> URLs and was
        # always overwritten here anyway--removed as dead code.)
        m = re.match(self.getSiteURLPattern(), url)
        if m:
            self.story.setMetadata('storyId', m.group('id'))

            # normalized story URL.
            self._setURL('https://' + self.getSiteDomain() + '/works/' +
                         self.story.getMetadata('storyId'))
        else:
            raise exceptions.InvalidStoryURL(url, self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'ao3')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%b-%d"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the bare site domain; AO3 does not use a www prefix."""
        return 'archiveofourown.org'

    @classmethod
    def getAcceptDomains(cls):
        """Domains this adapter accepts: the main site plus its download host."""
        main = cls.getSiteDomain()
        return [main, 'download.' + main]

    @classmethod
    def getSiteExampleURLs(cls):
        """Return space-separated example story URLs shown to the user."""
        domain = cls.getSiteDomain()
        examples = [
            "https://" + domain + "/works/123456",
            "https://" + domain + "/collections/Some_Archive/works/123456",
            "https://" + domain + "/works/123456/chapters/78901",
        ]
        return " ".join(examples)

    def getSiteURLPattern(self):
        """Regex matching story URLs, capturing the story ID as 'id'.

        Matches e.g.
        https://archiveofourown.org/collections/Smallville_Slash_Archive/works/159770
        Leading zeros are discarded from story IDs--AO3 doesn't use them in
        its own chapter URLs.
        """
        domain = re.escape(self.getSiteDomain())
        return (r"https?://(download\.)?" + domain +
                r"(/collections/[^/]+)?/works/0*(?P<id>\d+)")

    ## Login
    def needToLoginCheck(self, data):
        """Return True if the fetched page *data* indicates a login is needed.

        True when the work is restricted to registered users, or when a
        previous login attempt was rejected.
        """
        # Return the boolean expression directly instead of the old
        # if/return True/else/return False chain.
        return ('This work is only available to registered users of the Archive.' in data
                or "The password or user name you entered doesn't match our records" in data)

    def performLogin(self, url, data):
        """Log in to AO3.

        Uses self.username/self.password when a password was set on the
        adapter, otherwise falls back to the username/password config options.
        *data* is a previously fetched page containing the login form's CSRF
        token.

        Returns True on success; raises exceptions.FailedToLogin on failure.
        """
        params = {}
        if self.password:
            params['user_session[login]'] = self.username
            params['user_session[password]'] = self.password
        else:
            params['user_session[login]'] = self.getConfig("username")
            params['user_session[password]'] = self.getConfig("password")
        params['user_session[remember_me]'] = '1'
        params['commit'] = 'Log in'
        #params['utf8'] = u'✓'#u'\x2713' # gets along with out it, and it confuses the encoder.
        # CSRF token scraped out of the login form on the fetched page.
        params['authenticity_token'] = data.split(
            'input name="authenticity_token" type="hidden" value="')[1].split(
                '"')[0]

        loginUrl = 'https://' + self.getSiteDomain() + '/user_sessions'
        logger.info("Will now login to URL (%s) as (%s)" %
                    (loginUrl, params['user_session[login]']))

        d = self._postUrl(loginUrl, params)
        #logger.info(d)

        if "Successfully logged in" not in d:  #Member Account
            logger.info("Failed to login to URL %s as %s" %
                        (loginUrl, params['user_session[login]']))
            # The old 'return False' that followed this raise was unreachable
            # and has been removed.
            raise exceptions.FailedToLogin(url, params['user_session[login]'])
        return True

    def use_pagecache(self):
        """This adapter works with the shared page cache, so opt in."""
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the work's /navigate page (chapter list) and main page
        (metadata blurb), then populate story metadata and self.chapterUrls.

        Raises:
            exceptions.AdultCheckRequired: adult-content interstitial hit and
                is_adult not set.
            exceptions.StoryDoesNotExist: 404 or AO3's not-found message.
            exceptions.FailedToLogin: via performLogin().
        """
        # AO3 hides adult works behind a "proceed" interstitial; this query
        # param bypasses it once the user has asserted they are an adult.
        if self.is_adult or self.getConfig("is_adult"):
            addurl = "?view_adult=true"
        else:
            addurl = ""

        # metaurl: the work's main page (metadata); url: /navigate (full chapter list).
        metaurl = self.url + addurl
        url = self.url + '/navigate' + addurl
        logger.info("url: " + url)
        logger.info("metaurl: " + metaurl)

        try:
            data = self._fetchUrl(url)
            meta = self._fetchUrl(metaurl)

            if "This work could have adult content. If you proceed you have agreed that you are willing to see such content." in meta:
                raise exceptions.AdultCheckRequired(self.url)

        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Sorry, we couldn&#x27;t find the work you were looking for." in data:
            raise exceptions.StoryDoesNotExist(self.url)

        # need to log in for this one, or always_login.
        if self.needToLoginCheck(data) or \
                ( self.getConfig("always_login") and 'input name="authenticity_token"' in data ):
            ## except don't log in if already logged in (cached
            ## responses in calibre job).  already logged in if
            ## there's no authenticity_token in data.
            self.performLogin(url, data)
            # Refetch both pages bypassing the cache now that we're logged in.
            data = self._fetchUrl(url, usecache=False)
            meta = self._fetchUrl(metaurl, usecache=False)

        # use BeautifulSoup HTML parser to make everything easier to find.
        # Drop the site-wide admin banner so it can't pollute metadata text.
        soup = self.make_soup(data)
        for tag in soup.findAll('div', id='admin-banner'):
            tag.extract()
        metasoup = self.make_soup(meta)
        for tag in metasoup.findAll('div', id='admin-banner'):
            tag.extract()

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r"/works/\d+$"))
        self.story.setMetadata('title', stripHTML(a))

        # When logged in, the blurb page includes the user's own bookmark
        # form; mine it for their tags and notes.
        if self.getConfig("always_login"):
            try:
                self.story.extendList(
                    'bookmarktags',
                    metasoup.find(
                        'input',
                        id='bookmark_tag_string')['value'].split(', '))
            except KeyError:
                pass
            self.story.setMetadata(
                'bookmarksummary',
                stripHTML(metasoup.find('textarea', id='bookmark_notes')))

        # Find authorid and URL from... author url.
        alist = soup.findAll('a', href=re.compile(r"/users/\w+/pseuds/\w+"))
        if len(alist
               ) < 1:  # ao3 allows for author 'Anonymous' with no author link.
            self.story.setMetadata('author', 'Anonymous')
            self.story.setMetadata('authorUrl', 'https://archiveofourown.org/')
            self.story.setMetadata('authorId', '0')
        else:
            for a in alist:
                self.story.addToList('authorId', a['href'].split('/')[-1])
                self.story.addToList('authorUrl',
                                     'https://' + self.host + a['href'])
                self.story.addToList('author', a.text)

        byline = metasoup.find('h3', {'class': 'byline'})
        if byline:
            self.story.setMetadata('byline', stripHTML(byline))

        # byline:
        # <h3 class="byline heading">
        # Hope Roy [archived by <a href="/users/ssa_archivist/pseuds/ssa_archivist" rel="author">ssa_archivist</a>]
        # </h3>
        # stripped:"Hope Roy [archived by ssa_archivist]"

        # NOTE(review): byline can be None here (metasoup.find miss); this
        # call is not guarded like the setMetadata above -- confirm stripHTML
        # tolerates None, otherwise this can raise.
        m = re.match(r'(?P<author>.*) \[archived by (?P<archivist>.*)\]',
                     stripHTML(byline))
        if (m and len(alist) == 1 and self.getConfig('use_archived_author')):
            self.story.setMetadata('author', m.group('author'))

        newestChapter = None
        self.newestChapterNum = None  # save for comparing during update.
        # Scan all chapters to find the oldest and newest, on AO3 it's
        # possible for authors to insert new chapters out-of-order or
        # change the dates of earlier ones by editing them--That WILL
        # break epub update.
        # Find the chapters:
        chapters = soup.findAll(
            'a',
            href=re.compile(r'/works/' + self.story.getMetadata('storyId') +
                            "/chapters/\d+$"))
        self.story.setMetadata('numChapters', len(chapters))
        logger.debug("numChapters: (%s)" %
                     self.story.getMetadata('numChapters'))
        if len(chapters) == 1:
            # Single-chapter works use the story title as the chapter title.
            self.chapterUrls.append(
                (self.story.getMetadata('title'),
                 'https://' + self.host + chapters[0]['href']))
        else:
            for index, chapter in enumerate(chapters):
                # strip just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append(
                    (stripHTML(chapter),
                     'https://' + self.host + chapter['href']))
                # The date follows each chapter link in a span, wrapped in
                # parens, e.g. (2013-09-21); [1:-1] strips the parens.
                date = stripHTML(chapter.findNext('span'))[1:-1]
                chapterDate = makeDate(date, self.dateformat)
                if newestChapter == None or chapterDate > newestChapter:
                    newestChapter = chapterDate
                    self.newestChapterNum = index

        # Summary blurb.
        a = metasoup.find('blockquote', {'class': 'userstuff'})
        if a != None:
            self.setDescription(url, a)
            #self.story.setMetadata('description',a.text)

        a = metasoup.find('dd', {'class': "rating tags"})
        if a != None:
            self.story.setMetadata('rating', stripHTML(a.text))

        d = metasoup.find('dd', {'class': "language"})
        if d != None:
            self.story.setMetadata('language', stripHTML(d.text))

        # Tag lists: fandoms, warnings, freeform tags, categories,
        # characters, relationships, collections.
        a = metasoup.find('dd', {'class': "fandom tags"})
        fandoms = a.findAll('a', {'class': "tag"})
        for fandom in fandoms:
            self.story.addToList('fandoms', fandom.string)

        a = metasoup.find('dd', {'class': "warning tags"})
        if a != None:
            warnings = a.findAll('a', {'class': "tag"})
            for warning in warnings:
                self.story.addToList('warnings', warning.string)

        a = metasoup.find('dd', {'class': "freeform tags"})
        if a != None:
            genres = a.findAll('a', {'class': "tag"})
            for genre in genres:
                self.story.addToList('freeformtags', genre.string)

        a = metasoup.find('dd', {'class': "category tags"})
        if a != None:
            genres = a.findAll('a', {'class': "tag"})
            for genre in genres:
                # NOTE(review): genre is a Tag object, so this comparison with
                # the string "Gen" is always True and "Gen" is never filtered
                # out; likely intended genre.string != "Gen" -- confirm.
                if genre != "Gen":
                    self.story.addToList('ao3categories', genre.string)

        a = metasoup.find('dd', {'class': "character tags"})
        if a != None:
            chars = a.findAll('a', {'class': "tag"})
            for char in chars:
                self.story.addToList('characters', char.string)

        a = metasoup.find('dd', {'class': "relationship tags"})
        if a != None:
            ships = a.findAll('a', {'class': "tag"})
            for ship in ships:
                self.story.addToList('ships', ship.string)

        a = metasoup.find('dd', {'class': "collections"})
        if a != None:
            collections = a.findAll('a')
            for collection in collections:
                self.story.addToList('collections', collection.string)

        # The stats <dl> holds paired <dt>/<dd> label/value cells.
        stats = metasoup.find('dl', {'class': 'stats'})
        dt = stats.findAll('dt')
        dd = stats.findAll('dd')
        for x in range(0, len(dt)):
            label = dt[x].text
            value = dd[x].text

            if 'Words:' in label:
                self.story.setMetadata('numWords', value)

            if 'Comments:' in label:
                self.story.setMetadata('comments', value)

            if 'Kudos:' in label:
                self.story.setMetadata('kudos', value)

            if 'Hits:' in label:
                self.story.setMetadata('hits', value)

            if 'Bookmarks:' in label:
                self.story.setMetadata('bookmarks', value)

            if 'Chapters:' in label:
                # "Chapters: N/M" -- complete when N == M.
                if value.split('/')[0] == value.split('/')[1]:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                # dateUpdated defaults to the publish date; overwritten below
                # if an Updated/Completed stat is present.
                self.story.setMetadata(
                    'datePublished', makeDate(stripHTML(value),
                                              self.dateformat))
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

            if 'Completed' in label:
                # presumably AO3 shows "Completed:" instead of "Updated:" for
                # finished works, so treat it as the last-updated date --
                # TODO confirm.
                self.story.setMetadata(
                    'dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Find Series name from series URL.
        ddseries = metasoup.find('dd', {'class': "series"})

        if ddseries:
            for i, a in enumerate(
                    ddseries.findAll('a', href=re.compile(r"/series/\d+"))):
                series_name = stripHTML(a)
                series_url = 'https://' + self.host + a['href']
                series_index = int(
                    stripHTML(a.previousSibling).replace(
                        ', ', '').split(' ')[1])  # "Part # of" or ", Part #"
                self.story.setMetadata('series%02d' % i,
                                       "%s [%s]" % (series_name, series_index))
                self.story.setMetadata('series%02dUrl' % i, series_url)
                if i == 0:
                    self.setSeries(series_name, series_index)
                    self.story.setMetadata('seriesUrl', series_url)
コード例 #57
0
        metatext = stripHTML(smalldiv)

        if 'Completed: Yes' in metatext:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        wordstart=metatext.rindex('Word count:')+12
        words = metatext[wordstart:metatext.index(' ',wordstart)]
        self.story.setMetadata('numWords', words)

        datesdiv = soup.find('div',{'class':'bottom'})
        dates = stripHTML(datesdiv).split()
        # Published: 04/26/2011 Updated: 03/06/2013
        self.story.setMetadata('datePublished', makeDate(dates[1], self.dateformat))
        self.story.setMetadata('dateUpdated', makeDate(dates[3], self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links