Ejemplo n.º 1
0
 def test_get_host(self):
     """Check that get_host yields the bare site name for each URL form.

     The inputs vary the scheme, the ``www.`` prefix and the trailing
     slash, for both a plain and a hyphenated domain name.
     """
     cases = (
         ('https://www.google.com/', 'google'),
         ('http://www.google.com/', 'google'),
         ('www.google.com', 'google'),
         ('google.com', 'google'),
         ('https://www.style-files.com/', 'style-files'),
         ('http://www.style-files.com/', 'style-files'),
         ('www.style-files.com/', 'style-files'),
         ('style-files.com/', 'style-files'),
     )
     # Checked in order; the first mismatch fails the test immediately.
     for url, expected in cases:
         self.assertEqual(get_host(url), expected)
Ejemplo n.º 2
0
def main():
    """Command-line entry point: run the extractor matching a target URL.

    Reads one positional ``url`` argument, derives its host with
    ``get_host`` and looks up an extractor for that host with
    ``get_extractor``. If the lookup returns a falsy value, this is logged
    and the process exits with status 0. Otherwise the extractor is
    instantiated with the URL and its ``title``, ``text`` and ``images``
    methods are called in that order.
    """
    parser = argparse.ArgumentParser(
        description="""scrape selective site contents""",
    )
    parser.add_argument('url', help='Target URL')
    args = parser.parse_args()

    # The host is the key used to select an extractor.
    host = get_host(args.url)
    extractor_factory = get_extractor(host)

    if not extractor_factory:
        logger.info('No parser for %s', host)
        sys.exit(0)

    extractor = extractor_factory(args.url)

    # Return values are intentionally not captured, matching the original.
    extractor.title()
    extractor.text()
    extractor.images()