Example 1
import os
import pickle
import re

from lxml import etree

import crawlers  # project-local helper; get_url() fetches a URL's body


def get_main_logo(data):
    # data is a (path, name) tuple, e.g. one pair returned by crawl_az() in Example 2
    path, name = data
    page = etree.HTML(crawlers.get_url('http://www.goodlogo.com' + path))
    # Candidate logo images: large enough and carrying a longdesc attribute
    imgs = page.xpath('//img[@longdesc and @width > 10 and @height > 10]')
    matches = [re.search(r'(/images/logos/[^/]+\..*)', img.get('src')) for img in imgs]
    try:
        # Exactly one candidate is expected; anything else raises ValueError
        url, = [m.group(1) for m in matches if m]
    except ValueError:
        print(path)
        return
    logo_url = 'http://www.goodlogo.com' + url
    logo = crawlers.get_url(logo_url)
    # Drop non-ASCII characters so the name is safe to use in a filename
    name = name.encode('ascii', 'ignore').decode('ascii')
    os.makedirs('logos', exist_ok=True)
    os.makedirs('images', exist_ok=True)
    with open('logos/%s.pkl' % name, 'wb') as f:
        pickle.dump({'name': name, 'logo_url': logo_url, 'logo': logo}, f)
    # Assumes crawlers.get_url returns raw bytes for binary content
    with open('images/%s.%s' % (name, logo_url.rsplit('.', 1)[-1]), 'wb') as f:
        f.write(logo)
Example 2
import re

import crawlers  # same helper module used in Example 1


def crawl_az():
    # Return (path, name) pairs for every logo linked from the A-Z index page
    az = crawlers.get_url('http://www.goodlogo.com/a-z')
    return list(re.findall(r'<a href="(/extended[^"]+)" .*>([^<]+)</a>', az))


def get_node_and_url(line):
    # get_node_name and get_url are helpers defined elsewhere in the source module
    node = get_node_name(line)
    url = get_url(line)
    return (node, url)
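
The two examples compose naturally: crawl_az() yields the (path, name) pairs that get_main_logo() unpacks. A minimal driver sketch, assuming both functions are importable from the same module (the module name scrape_logos below is hypothetical):

from scrape_logos import crawl_az, get_main_logo  # hypothetical module name


def main():
    # Download and save every logo listed on the A-Z index page
    for entry in crawl_az():
        get_main_logo(entry)


if __name__ == '__main__':
    main()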