import re
import collections
import urllib2 as u2

from bs4 import BeautifulSoup

# Project-internal names used below -- Configuration, adapters, form_url,
# cleanup_url, UnknownSite, GZipProcessor -- are assumed to be imported
# from the surrounding package; their import paths are not shown here.


def get_urls_from_text(data, configuration=None, normalize=False):
    """Collect story URLs from a blob of text, deduped by canonical storyUrl."""
    urls = collections.OrderedDict()
    data = unicode(data)
    if not configuration:
        configuration = Configuration("test1.com", "EPUB")
    for href in re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            data):
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        if 'story.php' in href:
            m = re.search(
                r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",
                href)
            if m is not None:
                href = form_url(href, m.group('sid'))
        try:
            href = href.replace('&index=1', '')
            adapter = adapters.getAdapter(configuration, href)
            if adapter.story.getMetadata('storyUrl') not in urls:
                urls[adapter.story.getMetadata('storyUrl')] = [href]
            else:
                urls[adapter.story.getMetadata('storyUrl')].append(href)
        except:
            pass
    # Simply return the longest URL on the assumption that it contains the
    # most user-readable metadata, unless normalized URLs were requested.
    return urls.keys() if normalize else [
        max(value, key=len) for key, value in urls.items()
    ]
def get_urls_from_text(data, configuration=None, normalize=False):
    """Collect story URLs from a blob of text using parallel lists for dedup."""
    normalized = []  # normalized URLs
    retlist = []     # orig URLs
    data = unicode(data)
    if not configuration:
        configuration = Configuration("test1.com", "EPUB")
    for href in re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            data):
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        # The 'normalized' list prevents duplicates.
        if 'story.php' in href:
            m = re.search(
                r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",
                href)
            if m is not None:
                href = form_url(href, m.group('sid'))
        try:
            href = href.replace('&index=1', '')
            adapter = adapters.getAdapter(configuration, href)
            if adapter.story.getMetadata('storyUrl') not in normalized:
                normalized.append(adapter.story.getMetadata('storyUrl'))
                retlist.append(href)
        except:
            pass
    if normalize:
        return normalized
    else:
        return retlist
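# Usage sketch for the get_urls_from_text() variants above. The text and
# site are hypothetical; 'test1.com' is just the placeholder domain the
# default Configuration uses, so real use needs a site an adapter knows.
def _demo_get_urls_from_text():
    sample = u"New chapter: http://test1.com/viewstory.php?sid=1234&index=1"
    # Default: the original URL(s) found for each story.
    print get_urls_from_text(sample)
    # normalize=True: the adapter's canonical storyUrl instead.
    print get_urls_from_text(sample, normalize=True)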
def get_urls_from_html(data, url=None, configuration=None, normalize=False,
                       restrictsearch=None, email=False):
    """Collect story URLs from the links in an HTML page."""
    urls = collections.OrderedDict()

    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    ## soup and re-soup because BS4/html5lib is more forgiving of
    ## incorrectly nested tags that way.
    soup = BeautifulSoup(unicode(BeautifulSoup(data, "html5lib")), "html5lib")
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        #logger.debug("restrict search:%s"%soup)

    for a in soup.findAll('a'):
        if a.has_attr('href'):
            #logger.debug("a['href']:%s"%a['href'])
            href = form_url(url, a['href'])
            #logger.debug("1 urlhref:%s"%href)
            href = cleanup_url(href, email)
            try:
                #logger.debug("2 urlhref:%s"%href)
                adapter = adapters.getAdapter(configuration, href)
                #logger.debug("found adapter")
                if adapter.story.getMetadata('storyUrl') not in urls:
                    urls[adapter.story.getMetadata('storyUrl')] = [href]
                else:
                    urls[adapter.story.getMetadata('storyUrl')].append(href)
            except Exception, e:
                #logger.debug(e)
                pass

    # Simply return the longest URL on the assumption that it contains the
    # most user-readable metadata, unless normalized URLs were requested.
    return urls.keys() if normalize else [
        max(value, key=len) for key, value in urls.items()
    ]
def get_urls_from_text(data, configuration=None, normalize=False, email=False):
    """Collect story URLs from a blob of plain text."""
    urls = collections.OrderedDict()

    try:
        data = unicode(data)
    except UnicodeDecodeError:
        data = data.decode('utf8')  ## for when called outside calibre.

    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    for href in re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            data):
        href = cleanup_url(href, email)
        try:
            adapter = adapters.getAdapter(configuration, href)
            if adapter.story.getMetadata('storyUrl') not in urls:
                urls[adapter.story.getMetadata('storyUrl')] = [href]
            else:
                urls[adapter.story.getMetadata('storyUrl')].append(href)
        except:
            pass

    # Simply return the longest URL on the assumption that it contains the
    # most user-readable metadata, unless normalized URLs were requested.
    return urls.keys() if normalize else [
        max(value, key=len) for key, value in urls.items()
    ]
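# form_url() and cleanup_url(), used throughout this file, are
# project-internal helpers not defined here. The sketches below are
# assumptions about the behavior this code relies on, not the project's
# actual implementations: form_url() resolves a possibly relative link
# against the page it came from, and cleanup_url() strips chapter-index
# noise such as '&index=1'.
from urlparse import urljoin

def _form_url_sketch(parenturl, url):
    # Links that already carry a scheme/netloc pass through unchanged;
    # relative links are resolved against the parent page URL.
    url = url.strip()
    if '//' in url or parenturl is None:
        return url
    return urljoin(parenturl, url)

def _cleanup_url_sketch(href, email=False):
    # The real helper presumably does more (the email flag hints at
    # email-specific cleanup); this sketch only drops the '&index=1' suffix.
    return href.replace('&index=1', '')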
def get_urls_from_page(url, configuration=None, normalize=False):
    """Fetch a list page (logging in where needed) and collect story URLs."""
    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most sites that show the links to 'adult' stories but
        # protect them, AO3 doesn't even show them if not logged in.
        # Only works with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # just to get an authenticity_token.
                data = adapter._fetchUrl(url + addurl)
                # login the session.
                adapter.performLogin(url, data)
                # get the list page with the logged-in session.

        if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
            data = adapter._fetchUrl(url)
            adapter.set_adult_cookie()

        if 'tthfanfic.org' in url and adapter.getConfig("is_adult"):
            ## Simple fetch works in testing, but actual pages use a
            ## POST with a 'ctkn' value, so we do too.
            # adapter._fetchUrl("https://www.tthfanfic.org/setmaxrating.php?sitemaxrating=5")
            adapter.setSiteMaxRating(url)

        # this way it uses User-Agent or other special settings.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        data = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth
    # generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize,
                              restrictsearch)
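# The view_adult handling above boils down to appending one query
# parameter with the right separator. The same '?' vs '&' decision as a
# tiny standalone helper (hypothetical name, for illustration only):
def _add_query_param(url, param="view_adult=true"):
    return url + ('&' if '?' in url else '?') + param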
def get_urls_from_html(data, url=None, configuration=None, normalize=False,
                       restrictsearch=None):
    """Collect story URLs from page links, with special story.php handling."""
    urls = collections.OrderedDict()

    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    ## soup and re-soup because BS4/html5lib is more forgiving of
    ## incorrectly nested tags that way.
    soup = BeautifulSoup(unicode(BeautifulSoup(data, "html5lib")), "html5lib")
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        #logger.debug("restrict search:%s"%soup)

    for a in soup.findAll('a'):
        if a.has_attr('href'):
            #logger.debug("a['href']:%s"%a['href'])
            href = form_url(url, a['href'])
            #logger.debug("1 urlhref:%s"%href)
            # this (should) catch normal story links, some javascript
            # 'are you old enough' links, and 'Report This' links.
            if 'story.php' in a['href']:
                #logger.debug("trying:%s"%a['href'])
                m = re.search(
                    r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",
                    a['href'])
                if m is not None:
                    href = form_url(a['href'] if '//' in a['href'] else url,
                                    m.group('sid'))
            try:
                href = href.replace('&index=1', '')
                #logger.debug("2 urlhref:%s"%href)
                adapter = adapters.getAdapter(configuration, href)
                #logger.debug("found adapter")
                if adapter.story.getMetadata('storyUrl') not in urls:
                    urls[adapter.story.getMetadata('storyUrl')] = [href]
                else:
                    urls[adapter.story.getMetadata('storyUrl')].append(href)
            except Exception, e:
                #logger.debug(e)
                pass

    # Simply return the longest URL on the assumption that it contains the
    # most user-readable metadata, unless normalized URLs were requested.
    return urls.keys() if normalize else [
        max(value, key=len) for key, value in urls.items()
    ]
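# Usage sketch for the get_urls_from_html() variants above, with
# hypothetical page data. restrictsearch takes the same (name, attrs)
# pair that BeautifulSoup's find() does -- the same shape
# get_urls_from_page() builds for scarvesandcoffee.net.
def _demo_get_urls_from_html():
    page = u'''<html><body>
      <div id="mainpage">
        <a href="viewstory.php?sid=1234&index=1">A Story</a>
      </div>
      <div id="footer"><a href="other.php">not a story link</a></div>
    </body></html>'''
    # Only links inside <div id="mainpage"> are considered.
    return get_urls_from_html(page, url='http://test1.com/',
                              restrictsearch=('div', {'id': 'mainpage'}))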
def get_urls_from_html(data, url=None, configuration=None, normalize=False,
                       restrictsearch=None):
    """Collect story URLs from page links using parallel lists for dedup."""
    normalized = []  # normalized URLs
    retlist = []     # orig URLs

    if not configuration:
        configuration = Configuration("test1.com", "EPUB")

    soup = BeautifulSoup(data)
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        #print("restrict search:%s"%soup)

    for a in soup.findAll('a'):
        if a.has_key('href'):
            #print("a['href']:%s"%a['href'])
            href = form_url(url, a['href'])
            #print("1 urlhref:%s"%href)
            # this (should) catch normal story links, some javascript
            # 'are you old enough' links, and 'Report This' links.
            # The 'normalized' list prevents duplicates.
            if 'story.php' in a['href']:
                #print("trying:%s"%a['href'])
                m = re.search(
                    r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",
                    a['href'])
                if m is not None:
                    href = form_url(a['href'] if '//' in a['href'] else url,
                                    m.group('sid'))
            try:
                href = href.replace('&index=1', '')
                #print("2 urlhref:%s"%href)
                adapter = adapters.getAdapter(configuration, href)
                #print("found adapter")
                if adapter.story.getMetadata('storyUrl') not in normalized:
                    normalized.append(adapter.story.getMetadata('storyUrl'))
                    retlist.append(href)
            except Exception, e:
                #print(e)
                pass

    if normalize:
        return normalized
    else:
        return retlist
def get_urls_from_page(url, configuration=None, normalize=False):
    """Fetch a list page (logging into AO3 if configured) and collect URLs."""
    if not configuration:
        configuration = Configuration("test1.com", "EPUB")

    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most sites that show the links to 'adult' stories but
        # protect them, AO3 doesn't even show them if not logged in.
        # Only works with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # just to get an authenticity_token.
                data = adapter._fetchUrl(url + addurl)
                # login the session.
                adapter.performLogin(url, data)
                # get the list page with the logged-in session.

        # this way it uses User-Agent or other special settings. Only AO3
        # is doing login.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        data = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth
    # generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize,
                              restrictsearch)
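# Usage sketch for the get_urls_from_page() variants above. The list-page
# URL is hypothetical; any author or series page a site adapter
# recognizes works the same way, with login and cookie handling done by
# the adapter when the site needs it.
def _demo_get_urls_from_page():
    listpage = 'http://test1.com/series.php?seriesid=42'
    return get_urls_from_page(listpage, normalize=True)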