def main():
    """Run a small demo of ``LinkExtractor`` on a hard-coded HTML snippet.

    The snippet is fed to a fresh ``LinkExtractor`` and, for every tag it
    reports, the tag's ``href`` and then its ``content`` are printed on
    separate lines.
    """
    sample_html = "<p>hello</p><p><a class='link' href='#main'><img src='/xyz'/></a><a href='#index'>welcome abcd efgh</a></p>"

    extractor = LinkExtractor()
    extractor.feed(sample_html)

    # Single-argument print(...) emits the same text under Python 2 and 3.
    for found in extractor.get_tags():
        print(found.href)
        print(found.content)
def get_links(self, url, text):
    """Collect the links found in ``text``, resolved against ``url``.

    Each tag reported by ``LinkExtractor`` has its ``href`` cleaned, joined
    onto the cleaned base ``url`` and cleaned again. Tags whose resolved
    ``href`` is empty, or contains "javascript" (case-insensitive), are
    skipped.

    This is a best-effort parser: if anything goes wrong while extracting,
    "Content Parsing Error" is printed and the links gathered so far are
    returned.

    Args:
        url: Base URL that every extracted href is resolved against.
        text: Markup to scan. ``None`` yields an empty list.

    Returns:
        A list of ``Link`` objects, each built from the resolved href and
        an extra-info string derived from the original href and, when it
        can be converted with ``str``, the tag content.
    """
    links = []
    if text is None:
        return links

    try:
        extractor = LinkExtractor()
        extractor.feed(text)
        for tag in extractor.get_tags():
            # Keep the raw href: it feeds the extra-info string below,
            # while tag.href itself is overwritten with the resolved URL.
            original_href = tag.href
            tag.href = self._clean(
                urlparse.urljoin(self._clean(url), self._clean(tag.href))
            )
            if not tag.href or "javascript" in tag.href.lower():
                continue

            try:
                extra_info = self._alpha_num_str(
                    self._clean(original_href) + ' ' + str(self._clean(tag.content))
                )
            except Exception:
                # Fall back to the href alone when the content cannot be
                # combined with it (e.g. str() fails on the content).
                extra_info = self._alpha_num_str(self._clean(original_href))

            links.append(Link(self._clean(tag.href), extra_info))
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        print("Content Parsing Error")
    return links
def parse_maven_response(response):
    """Extract hrefs from the body of ``response``.

    ``response.content`` is decoded as UTF-8 and the resulting text is
    handed to ``LinkExtractor.extract_hrefs``; whatever that call returns
    is returned unchanged.
    """
    decoded_body = response.content.decode('utf-8')
    extractor = LinkExtractor()
    return extractor.extract_hrefs(decoded_body)