def _ensure_dir(path):
    """Create directory *path*; tolerate only the "already exists" case."""
    try:
        os.makedirs(path)
    except OSError:
        # Anything other than a pre-existing directory (permissions, a file
        # in the way, ...) is a real failure and must not be hidden.
        if not os.path.isdir(path):
            raise


def _ascii_text(value):
    """Return *value* as text with every non-ASCII character dropped."""
    if isinstance(value, bytes):
        return value.decode('ascii', 'ignore')
    return value.encode('ascii', 'ignore').decode('ascii')


def get_main_logo(data):
    """Download the main logo of one goodlogo.com page and save it to disk.

    The page at ``http://www.goodlogo.com`` + *path* is fetched and searched
    for ``<img>`` elements that carry a ``longdesc`` attribute and are larger
    than 10x10. Exactly one of them must have a ``src`` pointing below
    ``/images/logos/``; if none or several do, *path* is printed and the page
    is skipped.

    On success two files are written, relative to the current working
    directory (both directories are created on demand):

    * ``logos/<name>.pkl``   -- pickled dict with keys ``name``, ``logo_url``
      and ``logo``.
    * ``images/<name>.<ext>`` -- the downloaded logo itself, where ``<ext>``
      is whatever follows the last ``.`` in the logo URL.

    Args:
        data: A ``(path, name)`` pair. ``path`` is the site-relative URL of
            the logo page; ``name`` is the logo's display name (bytes or
            text). Non-ASCII characters are stripped from the name.

    Returns:
        None.

    Raises:
        OSError: If an output directory cannot be created.
    """
    path, name = data
    base_url = 'http://www.goodlogo.com'

    page = etree.HTML(crawlers.get_url(base_url + path))
    images = page.xpath('//img[@longdesc and @width > 10 and @height > 10]')

    # An <img> without a src attribute yields None; treat it as "no match".
    matches = [
        re.search(r'(/images/logos/[^/]+\..*)', img.get('src') or '')
        for img in images
    ]
    candidates = [match.group(1) for match in matches if match]
    if len(candidates) != 1:
        # Ambiguous or missing logo: report the page and skip it rather than
        # continuing without a URL to download.
        print(path)
        return

    logo_url = base_url + candidates[0]
    # NOTE(review): assumes crawlers.get_url returns the raw response body
    # (bytes) so it can be written in binary mode -- confirm.
    logo = crawlers.get_url(logo_url)
    name = _ascii_text(name)

    # The name comes from scraped markup; keep path separators out of the
    # file name so "A/B" cannot point into another (non-existent) directory.
    file_stem = name.replace('/', '_').replace(os.sep, '_')

    _ensure_dir('logos')
    record = {'name': name, 'logo_url': logo_url, 'logo': logo}
    # Binary mode: pickle output and image data are not text.
    with open('logos/%s.pkl' % file_stem, 'wb') as pickle_file:
        pickle.dump(record, pickle_file)

    _ensure_dir('images')
    extension = logo_url.rsplit('.', 1)[-1]
    with open('images/%s.%s' % (file_stem, extension), 'wb') as image_file:
        image_file.write(logo)
def crawl_az():
    """Fetch the goodlogo.com A-Z index and extract its ``/extended`` links.

    Returns:
        A list of ``(href, link_text)`` tuples, one per anchor matched in the
        downloaded page, in the order they were found.
    """
    index_html = crawlers.get_url('http://www.goodlogo.com/a-z')
    # NOTE(review): ``.*`` is greedy, so several anchors on the same line
    # would collapse into a single match (first href, last text) -- confirm
    # against the actual page markup.
    link_pattern = r'<a href="(/extended[^"]+)" .*>([^<]+)</a>'
    return re.findall(link_pattern, index_html)
def get_node_and_url(line):
    """Return the ``(node, url)`` pair extracted from *line*.

    The node is whatever ``get_node_name(line)`` yields and the URL whatever
    ``get_url(line)`` yields; both helpers are defined outside this block
    and are called in that order.
    """
    return get_node_name(line), get_url(line)