def get_profile_by_page(link, config):
    """Download a page and extract profile fields from it via selectors.

    Args:
        link: URL of the page to fetch.
        config: Mapping of profile field name to either a single selector
            string or a list of selector strings. For a list, selectors are
            tried in order and the first one that matches anything is used.

    Returns:
        A dict with one entry per key of ``config``. Each value is the
        comma-joined list of distinct (first occurrence kept), stripped
        values of the matched elements: the ``src`` attribute for ``img``
        elements, the text content for everything else. A field whose
        selectors match nothing maps to ``''``.
    """
    html = pg(requests.get(link).text)
    profile = {}
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for field, selectors in config.items():
        if not isinstance(selectors, list):
            selectors = [selectors]

        # Reset per field so an empty selector list can neither raise
        # NameError nor reuse the previous field's matches.
        matches = []
        for selector in selectors:
            matches = html(selector)
            if matches:
                break

        values = []
        for element in matches:
            if element.tag == 'img':
                # An <img> without a src attribute contributes an empty
                # value instead of raising KeyError.
                value = element.attrib.get('src', '')
            else:
                value = element.text_content()
            value = value.strip()
            if value not in values:
                values.append(value)
        # With no matches this joins an empty list, giving ''.
        profile[field] = ','.join(values)
    return profile
def get_links(text):
    """Run a search for ``text`` and return the links found in the results.

    The query URL is built by formatting ``SEARCH_QUERY`` with the URL-quoted
    ``text``. Every ``a`` element inside a ``.r`` element of the response is
    inspected: its ``href`` is matched against ``RE_LINK`` and the first
    captured group is URL-unquoted.

    Args:
        text: The search phrase.

    Returns:
        A list of unquoted link strings in page order. Anchors that have no
        ``href`` attribute, or whose ``href`` does not match ``RE_LINK``,
        are skipped.
    """
    query = SEARCH_QUERY.format(url_quote(text))
    html = pg(requests.get(query).text)

    result = []
    for anchor in html('.r')('a'):
        # .get() avoids a KeyError on anchors that carry no href attribute.
        href = anchor.attrib.get('href')
        if href is None:
            continue
        # NOTE(review): hrefs presumably embed the target as q={url};
        # confirm against the RE_LINK definition.
        match_result = RE_LINK.match(href)
        if not match_result:
            continue
        result.append(url_unquote(match_result.group(1)))
    return result
# Look up an IP address by scraping the ip138.com query page.
# NOTE: the interpreter and encoding lines below only take effect when they
# are among the first lines of a file.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from pyquery import PyQuery as pg

# raw_input() exists only on Python 2; fall back to input() on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

ipaddr = read_line("Enter IP:")
# Pass the query as params so requests URL-encodes the user-supplied value;
# a list of pairs keeps the original ip-then-action parameter order.
content = requests.get(
    'http://www.ip138.com/ips1388.asp',
    params=[('ip', ipaddr), ('action', '2')]
)
print(content.url)
doc = pg(content.content)
li = doc('li')
# Parenthesised so the call is valid on both Python 2 and Python 3.
print(li.text())