import pika import yaml import requests from lib.proxy_iterator import Proxies from lib.log import LogHandler from lxml import etree import re import json from pymongo import MongoClient log = LogHandler(__name__) p = Proxies() setting = yaml.load(open('config_dianping.yaml')) m = MongoClient(host=setting['mongo']['host'], port=setting['mongo']['port'], username=setting['mongo']['user_name'], password=setting['mongo']['password']) db = m[setting['mongo']['db_name']] dianping_all_type_collection = db[setting['mongo']['shop_detail_collection']] connection = pika.BlockingConnection( pika.ConnectionParameters(host=setting['rabbit']['host'], port=setting['rabbit']['port'])) channel = connection.channel() channel.queue_declare(queue='rpc_queue') class ConvertIdRpcServer(object): def __init__(self, proxies): self.proxies = proxies self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
def __init__(self):
    """Create a ``Proxies`` object and keep it on the instance as ``proxy``."""
    provider = Proxies()
    self.proxy = provider
from xiaozijia_core.y666yun import GetPhone import requests import re from lib.mongo import Mongo import datetime from lib.proxy_iterator import Proxies proxies = Proxies() proxies = proxies.get_one(proxies_number=1) class Register(object): def __init__(self): self.password = '******' self.s = requests.session() self.g = GetPhone('小资家') self.phone = self.g.phone self.headers = { 'Connection': 'keep-alive', 'Host': 'www.xiaozijia.cn:8002', 'User-Agent': 'xiao zi jiaiOS/1.2.1 (iPhone; iOS 11.4.1; Scale/2.00)', } self.code = '' self.m = Mongo('114.80.150.196', 27777, user_name='goojia', password='******') self.coll = self.m.connect['friends']['xiaozijia_user'] self.result = '' def sent_phone(self):
import requests from lxml import etree from lib.proxy_iterator import Proxies from pymongo import MongoClient import re import threading from lib.log import LogHandler from retry import retry log = LogHandler('lianjia') p = Proxies() p = p.get_one(proxies_number=7) m = MongoClient(host='114.80.150.196', port=27777, username='******', password='******') collection = m['hilder_gv']['sichuan'] sichuan_city_list = [ '成都', '绵阳', '宜宾', '自贡', '攀枝花', '广元', '乐山', '南充', '泸州', '资阳', '内江', '达州', '巴中', '遂宁', '眉山', '德阳', '广安', '雅安', '阿坝州', '甘孜州', '凉山州' ] class Lianjia: def __init__(self): self.headers = { 'Cookie': 'lianjia_uuid=44a258db-4e00-4541-997c-57f4f3c117c1; _smt_uid=5c077f11.54f9c61d; gr_user_id=34c329d5-abde-48c8-8e92-164aeb1967c4; UM_distinctid=1677d485e781e8-08ba54e7ba4e7e-35607402-1fa400-1677d485e7994; _jzqc=1; _ga=GA1.2.130576672.1543995159; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1543995154,1544173828,1544173833; _jzqy=1.1544173829.1544173833.2.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6%E5%9C%B0%E4%BA%A7.jzqsr=baidu; _jzqx=1.1544430132.1544608309.5.jzqsr=bj%2Elianjia%2Ecom|jzqct=/.jzqsr=bj%2Elianjia%2Ecom|jzqct=/chengjiao/fengtai/; _gid=GA1.2.2020321299.1545103818; lianjia_ssid=b653ca99-45ef-4791-adbc-8cc15e705d04; _jzqa=1.4552267029258056000.1543995157.1545189315.1545206059.32; _jzqckmp=1; Qs_lvt_200116=1544798856%2C1545206539; Qs_pv_200116=235986746040596130%2C4405708339866472400%2C1972589321055627500%2C3526812790752574500%2C3163296021085384000; gr_session_id_a1a50f141657a94e=1aed3e59-04fb-4f93-90bc-5637149eeea8; gr_session_id_a1a50f141657a94e_1aed3e59-04fb-4f93-90bc-5637149eeea8=true; select_city=310000; all-lj=dafad6dd721afb903f2a315ab2f72633; TY_SESSION_ID=3a1d7567-ccca-4314-a3df-f1551037dceb; CNZZDATA1253492439=1920645834-1545204530-https%253A%252F%252Fbj.lianjia.com%252F%7C1545204530; CNZZDATA1254525948=828434328-1545203924-https%253A%252F%252Fbj.lianjia.com%252F%7C1545203924; 
CNZZDATA1255633284=1412891771-1545206158-https%253A%252F%252Fbj.lianjia.com%252F%7C1545206158; CNZZDATA1255604082=774544540-1545204688-https%253A%252F%252Fbj.lianjia.com%252F%7C1545204688; _qzjc=1; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1545207895; _qzja=1.386660674.1545207885583.1545207885583.1545207885583.1545207891886.1545207895471.0.0.0.5.1; _qzjb=1.1545207885583.5.0.0.0; _qzjto=5.1.0; _jzqb=1.134.10.1545206059.1', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
import requests from lib.mongo import Mongo from lib.log import LogHandler import time import datetime from lib.proxy_iterator import Proxies p = Proxies() P = p.get_one(proxies_number=3) m = Mongo('114.80.150.196', 27777, user_name='goojia', password='******') collection = m.connect['friends']['zhizi_list'] detail_collection = m.connect['friends']['zhizi_detail'] deal_price_collection = m.connect['friends']['zhizi_deal_price_new'] listing_price_collection = m.connect['friends']['zhizi_listing_price'] new_house_collection = m.connect['friends']['zhizi_new_house'] new_house_sales_license_collection = m.connect['friends'][ 'zhizi_new_house_sales_license'] log = LogHandler(__name__) def time_convert(data_): # 时间转换 '1532448000000' return time.strftime("%Y-%m-%d", time.localtime(data_ / 1000.0)) def price_convert(price_): # 价格转换 万元转元 return int(price_) * 10000 headers = {
# _*_ coding:utf-8 _*_
# from company.baidumap_consumer import BaiduMapConsumer
from company.baidumap_producer import baiduproducer
from lib.proxy_iterator import Proxies
from multiprocessing import Process
from company.baidumap_consumer_update import BaiduMapConsumer

if __name__ == '__main__':
    # Disabled: launching the producer in its own process.
    # Process(target=baiduproducer).start()

    p = Proxies()

    # Disabled: a single consumer fed from next(p).
    # Process(target=BaiduMapConsumer(proxies=next(p)).start_consume).start()

    # Start six consumer processes; each gets the proxy setting returned by
    # p.get_one(x) for x = 1..6.
    for x in range(1, 7):
        consumer = BaiduMapConsumer(proxies=p.get_one(x))
        worker = Process(target=consumer.start_consume)
        worker.start()

    # Disabled: one consumer routed through a fixed abuyun tunnel proxy.
    # proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    #     "host": "http-dyn.abuyun.com",
    #     "port": "9020",
    #     "user": "******",
    #     "pass": "******",
    # }
    # proxies = {"https": proxy,
    #            "http": proxy}
    # Process(target=BaiduMapConsumer(proxies=proxies).start_consume).start()
# auth = Auth(access_key, secret_key) # # 初始化BucketManager # bucket_manager = BucketManager(auth) # ret, info = bucket_manager.fetch(url,bucket,filename) # if info.status_code == 200: # # file_url = bucket_domain + "/" + filename # print(file_url) # return file_url # else: # print("{}抓取失败".format(url)) """ 图片爬取 """ proxy = Proxies() bucket = 'fangjia-img' log = LogHandler("qiniu") @retry(delay=2) def qiniufetch(url, file_name): headers = { "user_agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" } if 'http' in url: """ 使用代理池 """
猎聘运行是3,4 """ # from company.liepin_category import get_city,get_category from company.liepin_producer_list import LiepinProduceList from lib.proxy_iterator import Proxies from multiprocessing import Process from company.liepin_consumer_single import LiepinConsumeSingle from company.liepin_producer_detail import LiepinProducerDetail from company.liepin_consumer_gevent import LiepinConsumeGevent if __name__ == '__main__': #1.分别将城市代码及分类代码存入到mysql数据库中 # get_city() # get_category() #2.生产者,将分页也就是列表页链接放入到队列中 p = Proxies() Process(target=LiepinProduceList(proxies=next(p)).start_crawler).start() #3.生产者,消费2中队列的url,解析出来公司的url,将公司详情页放入到队列中 Process(target=LiepinProducerDetail(proxies=next(p)).start_consume).start() #4.消费3中队列中的URL,发请求\解析\入库 p = Proxies() for x in range(1, 7): Process(target=LiepinConsumeSingle(proxies=p.get_one( proxies_number=x)).start_consume).start() proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { "host": "http-dyn.abuyun.com", "port": "9020", "user": "******",
else: if text['status'] == '1': poi_list = text['data']['poi_list'] for poi in poi_list: address = poi['address'] if map_street in address: dict_text = dict(poi) poi_info.append(dict_text) else: break # 注意此处是更新 if len(poi_info) != 31: mongo_collection.update_one( { 'city_code': data['city_code'], 'region': data['region'], 'street_number': data['street_number'] }, {'$set': { 'poi_info': poi_info }}) else: print(res.json()) log.error('请求失败,status不为1,url = {}'.format(res.url)) return True if __name__ == '__main__': p = Proxies() street = AddStreet(proxies=p.get_one(proxies_number=1)) street.add_streets()