Beispiel #1
0
def get_profile_by_page(link, config, timeout=None):
    """Fetch ``link`` and build a profile dict from CSS selectors.

    Args:
        link: URL of the page to download with ``requests.get``.
        config: Mapping of profile key -> selector, or profile key -> list
            of selectors. For a list, the selectors are tried in order and
            the first one that matches anything is used.
        timeout: Optional timeout forwarded to ``requests.get``. The default
            ``None`` keeps the behaviour of passing no timeout at all.

    Returns:
        dict mapping every key of ``config`` to a string: the distinct,
        stripped values of the matched elements joined with ``,`` in match
        order (the ``src`` attribute for ``img`` elements, the text content
        for everything else), or ``''`` when nothing matched.
    """
    html = pg(requests.get(link, timeout=timeout).text)
    profile = {}

    # .items() rather than the Python 2-only .iteritems(), so the function
    # runs on both Python 2 and Python 3.
    for key, selectors in config.items():
        if not isinstance(selectors, list):
            selectors = [selectors]

        # Start from an empty match for every key so that an empty selector
        # list can neither leave the match unbound nor reuse the match that
        # belonged to the previous key.
        matched = ()
        for selector in selectors:
            found = html(selector)
            if found:
                matched = found
                break

        values = []
        for element in matched:
            if element.tag == 'img':
                # An <img> without a src attribute contributes an empty
                # value instead of raising KeyError.
                value = element.attrib.get('src', '')
            else:
                value = element.text_content()
            value = value.strip()
            # Drop duplicates while keeping first-seen order.
            if value not in values:
                values.append(value)

        # Joining an empty list gives '', so a key without any match is
        # still present in the result.
        profile[key] = ','.join(values)

    return profile
Beispiel #2
0
def get_links(text):
    """Run a search for ``text`` and return the unquoted result URLs.

    The query URL is ``SEARCH_QUERY`` formatted with the URL-quoted
    ``text``. From the downloaded page, the ``href`` of every ``a`` element
    selected under the ``.r`` matches is tested against ``RE_LINK``; hrefs
    that do not match are skipped.

    Returns:
        list of strings: the first ``RE_LINK`` group of each matching href,
        passed through ``url_unquote``, in page order.
    """
    search_url = SEARCH_QUERY.format(url_quote(text))
    page = pg(requests.get(search_url).text)

    # Collect all hrefs up front, before any of them is matched.
    hrefs = [anchor.attrib['href'] for anchor in page('.r')('a')]

    # Group 1 of RE_LINK carries the target URL (presumably the q={url}
    # part of the href -- verify against the RE_LINK definition).
    matches = (RE_LINK.match(href) for href in hrefs)
    return [url_unquote(hit.groups()[0]) for hit in matches if hit]
Beispiel #3
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Look up an IP address by scraping the ip138.com query page.
import requests
from pyquery import PyQuery as pg

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    read_input = raw_input
except NameError:
    read_input = input

ipaddr = read_input("Enter IP:")
# Pass the query as params so requests URL-encodes the user-supplied value;
# a list of pairs keeps the ip/action parameter order fixed.
content = requests.get('http://www.ip138.com/ips1388.asp',
                       params=[('ip', ipaddr), ('action', 2)])
print(content.url)
doc = pg(content.content)
li = doc('li')
# Function-call form of print is valid on both Python 2 and Python 3.
print(li.text())