def process_feeds(tuples, user):
    """ Take feed URLs from detect_feeds_in_HTML() and create an outline entry for opml """
    for t in tuples:
        html = parser(user.url, convertEntities=parser.HTML_ENTITIES).contents[0]
        # Ignore feeds for comments
        if "comment" in t:
            continue
        # Ignore annoying typo in the html from a friend
        if "\"" in t:
            continue
        # Convert relative URLs
        if "http" in t:
            xml = parser(t, convertEntities=parser.HTML_ENTITIES).contents[0]
        else:
            myxml = html + t
            xml = parser(myxml, convertEntities=parser.HTML_ENTITIES).contents[0]
        # If we've got something, rather than nothing, return a dict
        if xml:
            return {'title': user.name, 'html_url': html, 'xml_url': xml}
        else:
            return None
def process_feeds(self, tuples, user):
    """ Take feed URLs from detect_feeds_in_HTML() and create an outline entry for opml """
    for t in tuples:
        html = parser(user.url, convertEntities=parser.HTML_ENTITIES).contents[0]
        # Ignore feeds for comments
        if "comment" in t:
            continue
        # Ignore annoying typo in the html from a friend
        if "\"" in t:
            continue
        # Convert relative URLs
        if "http" in t:
            xml = parser(t, convertEntities=parser.HTML_ENTITIES).contents[0]
        else:
            myxml = html + t
            xml = parser(myxml, convertEntities=parser.HTML_ENTITIES).contents[0]
        # If we've got something, rather than nothing, return a dict
        if xml:
            return {'title': user.name, 'html_url': html, 'xml_url': xml}
        else:
            return None
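# Minimal usage sketch for process_feeds(), not part of the original snippets.
# It assumes `parser` is bound to the BeautifulSoup 3 constructor (only that
# version accepts the convertEntities keyword) and that the `user` argument is
# any object exposing .name and .url; the User namedtuple below is a
# hypothetical stand-in for whatever the real caller passes.
from collections import namedtuple

User = namedtuple("User", ["name", "url"])
user = User(name="Example Blog", url="http://example.com")

# A relative feed path gets joined onto the user's page URL.
entry = process_feeds(["/feeds/posts.rss"], user)
print(entry)
# roughly: {'title': 'Example Blog', 'html_url': u'http://example.com',
#           'xml_url': u'http://example.com/feeds/posts.rss'}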
def detect_feeds_in_HTML(input_stream):
    """ Examines an open text stream with HTML for referenced feeds.

    This is achieved by detecting all ``link`` tags that reference a feed in HTML.

    :param input_stream: an arbitrary opened input stream that has a :func:`read` method.
    :type input_stream: an input stream (e.g. open file or URL)
    :return: a list of feed URLs found in the HTML
    :rtype: ``list(str)``
    """
    # check that we really got an input stream
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!"
                        % type(input_stream))
    result = []
    # get the textual data (the HTML) from the input stream
    html = parser(input_stream.read())
    # find all <link> tags whose rel attribute is "alternate" (feed references)
    feed_urls = html.findAll("link", rel="alternate")
    # extract the URL of each feed link
    for feed_link in feed_urls:
        url = feed_link.get("href", None)
        # keep it only if a valid URL is present
        if url:
            result.append(url)
    return result
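# Hedged usage sketch for detect_feeds_in_HTML(), not part of the original
# snippets: any object with a read() method works, so an in-memory buffer
# stands in for an open file or URL response; `parser` is assumed to be the
# BeautifulSoup constructor used throughout these snippets.
import io

page = io.StringIO(
    '<html><head>'
    '<link rel="alternate" type="application/rss+xml" href="http://example.com/feed.rss">'
    '</head><body></body></html>'
)
print(detect_feeds_in_HTML(page))
# -> ['http://example.com/feed.rss']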
def find_url(domain, page, text, only_out=True):
    """ Collect (url, link_text) pairs from the anchors of an HTML document. """
    soup = parser(text)
    links = soup('a')
    links_final = []
    page_root = page.replace('http://', '').replace('www', '').split('/')[0]
    for link in links:
        try:
            if 'href' in dict(link.attrs):
                url = urljoin(page, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]  # remove the fragment (location) portion
                if url[0:4] == 'http':
                    linkText = gettextonly(link)
                    if only_out:
                        # keep only links whose host differs from the current page's host
                        link_root = link['href'].replace(
                            'http://', '').replace('www', '').split('/')[0]
                        if link_root != page_root:
                            links_final.append((url_uniformer(url), linkText))
                    else:
                        links_final.append((url_uniformer(url), linkText))
        except Exception:
            # skip malformed anchors rather than aborting the whole page
            continue
    return links_final
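# Self-contained sketch of calling find_url(), not part of the original
# snippets. The real project supplies urljoin (urlparse / urllib.parse),
# gettextonly() and url_uniformer() from elsewhere; the crude stand-ins below
# exist only so the example runs in the same module, and `parser` is again
# assumed to be BeautifulSoup.
def gettextonly(node):
    # crude stand-in: the real helper walks the tree and strips markup
    return node.string or ""

def url_uniformer(url):
    # crude stand-in: the real helper normalizes URLs
    return url.rstrip('/')

page_html = '<a href="http://other.example.org/page">An outbound link</a>'
links = find_url('example.com', 'http://example.com/index.html', page_html)
print(links)
# -> [('http://other.example.org/page', u'An outbound link')]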
def request(self, url):
    self.response = requests.get(self.valid_url)
    status_code = self.response.status_code

    if status_code == 200:
        self.parsed_html = parser(self.response.text)
        if not self.folder_created:
            # Create the download folder on the first successful request
            print("Fetching links from server %s" % self.valid_url)
            self.directory = "./" + self.title + "/"
            self.create_folder(self.directory)
            self.folder_created = True
            self.crawl_links(primary_links=True)
        else:
            self.crawl_links()
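# The method above couples fetching, parsing and crawling into one step. As an
# isolated, hedged sketch of just the fetch-and-parse part (not part of the
# original snippets), assuming `requests` is installed and `parser` is the
# BeautifulSoup constructor:
import requests

response = requests.get("http://example.com/")
if response.status_code == 200:
    parsed_html = parser(response.text)
    # e.g. list every anchor on the page
    for a in parsed_html.findAll("a"):
        print(a.get("href"))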
def parse_raw_html(self, raw_html):
    # Wrap a raw HTML string in a parsed document tree
    return parser(raw_html)
def beautify(self, url):
    response_data = self.simple_request(url)
    # Parse the response data into an HTML tree, if the request returned anything
    if response_data:
        return parser(response_data)
    else:
        return None
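# Hedged usage sketch for beautify(), not part of the original snippets:
# simple_request() is whatever HTTP helper the surrounding class defines, so it
# is faked here purely to exercise the None guard, and `parser` is again
# assumed to be BeautifulSoup.
class DemoScraper(object):
    def simple_request(self, url):
        # hypothetical helper: return the page body, or None on failure
        return "<html><head><title>hello</title></head><body></body></html>"

DemoScraper.beautify = beautify      # reuse the function above as a method

doc = DemoScraper().beautify("http://example.com/")
print(doc.title.string if doc else "request failed")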