# --- Example 1 ---
url_for_proxy_test = ' '
# connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}
connect_dict = {'host': 'localhost', 'user': '******', 'passwd': '', 'charset': 'utf8'}

# db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
# proxy_list = map(lambda x: x[0], db_server.getData(var='proxy_port', distinct=True))
# for p in proxy_list:
#     qu_proxy_test.put(p)

patt_ip = re.compile(r'(?<![\.\d])(?:\d{1,3}\.){3}\d{1,3}(?![\.\d])')
proxy_list = []

for table_name in table_name_s.split(','):
    print table_name
    db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
    if db_server.isTableExist():
        proxy_list += map(lambda x: x[0], db_server.getData(var='proxy_port'))

proxy_list_t=list(set(proxy_list))
for p in proxy_list_t:
    qu_proxy_test.put(p)


def original_ip_address():
    """Return this machine's public IP address as reported by httpbin.org.

    Used as the baseline address so requests made through a proxy can be
    compared against the real one.  Raises requests.RequestException on
    network failure or timeout.
    """
    # A timeout is essential: requests.get() without one blocks forever
    # if the host is unreachable, stalling the whole proxy check.
    t = requests.get('http://httpbin.org/ip', timeout=10).text
    return json.loads(t).get('origin')


# Capture the machine's real public IP once at startup, before any proxy use.
original = original_ip_address()

# --- Example 2 ---
# NOTE(review): sys.setdefaultencoding is deleted by site.py at interpreter
# startup; this Python 2 hack only works after a reload(sys) somewhere above
# -- confirm the surrounding script does that, otherwise this raises
# AttributeError.
sys.setdefaultencoding('utf8')

# config_text -- crawl/storage configuration for the Suning product spider.
db_name = 'platform_data'  # MySQL schema the crawl results are written to
table_name = 'suning'
# Column list for the result table.  NOTE(review): 'serice_rating' looks like
# a typo for 'service_rating', but it is a column name -- renaming it would
# break any table already created with this schema, so it is left as-is.
table_title = 'product_url,catalogue,sub_catalogue,product_title,promotion_desc,origin_price,price,' \
              'product_stars,comment_count,sending_service,other_service,product_params,shop_name,' \
              'shop_href,product_rating,product_rating_avg,serice_rating,service_rating_avg,express_rating,' \
              'express_rating_avg,com_name_tel,crawl_time'
url_start = 'http://www.suning.com/emall/pgv_10052_10051_1_.html'  # start url for crawl,string

connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}

# script -- create the result table on first run; later runs reuse it.
db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)

if not db_server.isTableExist():
    db_server.createTable(tableTitle=table_title.split(','))

class Handler(BaseHandler):
    # pyspider handler for the Suning crawl.
    # NOTE(review): this excerpt is truncated -- the body of step_first's
    # loop (and any further methods) is missing below this snippet.
    crawl_config = {
    }

    @every(minutes=24 * 60)  # re-seed the crawl once per day
    def on_start(self):
        self.crawl(url_start, callback=self.step_first)

    @config(age=2 * 24 * 60 * 60)  # a fetched page stays valid for 2 days
    def step_first(self, response):
        # Walk the left-hand category list and visit each category link.
        d = response.doc
        for t in d('.listLeft>dl>dd>span>a').items():
# --- Example 3 ---
# Configuration for the 58.com second-hand-housing (ershoufang) crawl.
db_name = 'address'
table_name_1 = 'ershoufang_58city_baseinfo'  # per-city listing summaries
table_name_2 = 'ershoufang_58city_detail'    # one row per listing detail page
table_title_1 = 'detail,crawl_time'
table_title_2 = 'url,detail,crawl_time'
url_start = 'http://www.58.com/ershoufang/changecity/'
# connect string , usually no need to modify
connect_dict = {'host': '10.118.187.12', 'user': '******',
                'passwd': 'admin', 'charset': 'utf8'}

db_server_1 = DBService(dbName=db_name, tableName=table_name_1, **connect_dict)
db_server_2 = DBService(dbName=db_name, tableName=table_name_2, **connect_dict)

# Create the result tables on first run; later runs reuse them unchanged.
if not db_server_1.isTableExist():
    db_server_1.createTable(tableTitle=table_title_1.split(','))

if not db_server_2.isTableExist():
    db_server_2.createTable(tableTitle=table_title_2.split(','))

# Raw strings: '\d', '\s' and '\;' in plain literals are invalid escape
# sequences (SyntaxWarning on modern Python); r'...' yields byte-identical
# pattern strings, so the compiled regexes are unchanged.
pat_num = re.compile(r'\d+')                  # first run of digits
pat_replace_space = re.compile(r'\s+?')       # non-greedy whitespace -- presumably used with sub(); verify at call site
pat_comment = re.compile(r'var arr=(.+?)\;')  # JS "var arr=...;" payload embedded in the page


class Handler(BaseHandler):
    # pyspider handler for the 58.com crawl.
    # NOTE(review): this excerpt is truncated -- crawl_config (and the rest
    # of the class) continues past the last visible line.
    crawl_config = {
        # 'proxy': '10.10.10.10:80',
        'headers': {
            # Desktop Firefox UA; the string literal continues on the next
            # (unseen) line via implicit concatenation.
            'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) '