Beispiel #1
0
def main():
    """Run a small demo: extract the anchors from a fixed HTML snippet.

    The snippet is fed to a ``LinkExtractor`` and, for every tag it
    reports, the tag's ``href`` and ``content`` are printed on separate lines.
    """
    # A disabled earlier variant fetched "http://www.coddicted.com" with
    # Fetcher(logging.DEBUG) and printed link.url for every link returned by
    # Parser("java", logging.DEBUG).get_links(url, text).
    sample_html = "<p>hello</p><p><a class='link' href='#main'><img src='/xyz'/></a><a href='#index'>welcome abcd efgh</a></p>"

    extractor = LinkExtractor()
    extractor.feed(sample_html)

    for anchor in extractor.get_tags():
        # Parenthesised single-argument form prints the same text whether
        # ``print`` is a statement or a function.
        print(anchor.href)
        print(anchor.content)
Beispiel #2
0
 def get_links(self, url, text):
     """Collect the links found in ``text``, resolved against ``url``.

     ``text`` is fed to a ``LinkExtractor``; every tag it reports has its
     ``href`` joined onto ``url`` with ``urlparse.urljoin``. Tags whose
     resolved href is empty or contains "javascript" (case-insensitive)
     are skipped.

     Args:
         url: Base URL the hrefs are resolved against.
         text: Markup to scan; ``None`` yields an empty list.

     Returns:
         A list of ``Link`` objects built from the resolved href and an
         "extra info" string derived from the original href and, when it
         can be stringified, the tag content. If parsing fails part-way,
         the links gathered up to that point are returned.
     """
     links = []
     if text is None:
         return links

     try:
         extractor = LinkExtractor()
         extractor.feed(text)
         for tag in extractor.get_tags():
             # Keep the raw href: it feeds the extra-info text below, while
             # tag.href is overwritten with the resolved form.
             original_href = tag.href
             tag.href = self._clean(
                 urlparse.urljoin(self._clean(url), self._clean(tag.href)))
             if not tag.href or "javascript" in tag.href.lower():
                 continue

             try:
                 extra_info = self._alpha_num_str(
                     self._clean(original_href) + ' ' + str(self._clean(tag.content)))
             except Exception:
                 # Fall back to the href alone when the content cannot be
                 # cleaned, stringified or concatenated.
                 extra_info = self._alpha_num_str(self._clean(original_href))

             links.append(Link(self._clean(tag.href), extra_info))
     except Exception:
         # Best-effort parsing: report and return what was collected so far.
         # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
         # SystemExit propagate instead of being silently swallowed.
         print("Content Parsing Error")
     return links
def parse_maven_response(response):
    """Return the hrefs extracted from a response body.

    ``response.content`` is decoded as UTF-8 and the resulting text is
    handed to ``LinkExtractor.extract_hrefs``; whatever that call returns
    is passed straight back to the caller.
    """
    decoded_body = response.content.decode('utf-8')
    extractor = LinkExtractor()
    return extractor.extract_hrefs(decoded_body)