import random
import json

import yaml

from lib.log import LogHandler
from lib.mongo import Mongo  # required by Mongo() below; missing from the original imports
from lib.rabbitmq import Rabbit

m = Mongo('192.168.0.235')
connect = m.connect
setting = yaml.load(open('config.yaml'))
db_name = setting['CEIC']['mongo']['db']
State_indicators_name = setting['CEIC']['mongo']['State_indicators']
State_indicators_details_name = setting['CEIC']['mongo'][
    'State_indicators_details']
log = LogHandler('ceic_detail')


def create_date(
        indexFrequency,
        start_year,
        start_mouth,
        end_year,
):
    """
    Split the start date into year/month query ranges.

    :return: ['from=2016-1&to=2017-1',
              'from=2016-1&to=2017-1',
              'from=2016-1&to=2017-1',
              'from=2016-1&to=2017-1',]
    """
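# Illustrative sketch only: the body of create_date is not shown above. This
# hypothetical helper shows one way to build the 'from=YYYY-M&to=YYYY-M' windows
# the docstring describes, assuming one one-year window per start year.
def _create_date_sketch(start_year, start_mouth, end_year):
    return ['from={0}-{1}&to={2}-{1}'.format(year, start_mouth, year + 1)
            for year in range(int(start_year), int(end_year))]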
import re
import datetime
import json

import yaml
import requests
from lxml import etree

from lib.log import LogHandler
from lib.mongo import Mongo
from sql_mysql import inquire, TypeAuction
from auction import Auction

setting = yaml.load(open('config.yaml'))
client = Mongo(host=setting['mongo']['host'],
               port=setting['mongo']['port']).connect
coll = client[setting['mongo']['db']][setting['mongo']['collection']]
source = 'jiapai'
log = LogHandler(__name__)


class Jiapai:

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'
        }
        self.list_info = []
        self.type_list = inquire(TypeAuction, source)

    def start_crawler(self):
        for type_ in self.type_list:
            html_type = type_.html_type
            auction_type = type_.auction_type
            url = 'http://www.jiapai.net.cn/index.php/Judicial/index/px/' + type_.code
CO_INDEX : 101
author: 程纪文
"""
import re
import random
import time

import requests
from lxml import etree

from crawler_base import Crawler
from comm_info import Comm, Building, House
from get_page_num import AllListUrl
from lib.log import LogHandler

co_index = '101'
city = '保定'
log = LogHandler('baoding_101')


class Baoding(Crawler):

    def __init__(self):
        self.start_url = 'http://www.bdfdc.net/loadAllProjects.jspx'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='共(\d+)页',
                       )
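# Illustrative sketch only: AllListUrl's internals are not shown here. Assuming
# page_count_rule is applied as a plain regex against the first list page, the
# total page count would be extracted roughly like this (hypothetical helper):
def _page_count_sketch(first_page_url, headers, page_count_rule='共(\d+)页'):
    res = requests.get(first_page_url, headers=headers)
    res.encoding = 'utf-8'
    match = re.search(page_count_rule, res.text)
    return int(match.group(1)) if match else 1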
city : 武汉
CO_INDEX : 78
author: 程纪文
"""
import re
import time
from urllib import parse

from lxml import etree

from backup.crawler_base import Crawler
from backup.comm_info import Comm, Building, House
from backup.proxy_connection import Proxy_contact
from lib.log import LogHandler

city = '武汉'
co_index = '78'
log = LogHandler('wuhan_78')


class Wuhan(Crawler):

    def __init__(self):
        self.start_url = 'http://scxx.fgj.wuhan.gov.cn/xmqk.asp'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        proxy = Proxy_contact(app_name='wuhan', method='get',
                              url=self.start_url, headers=self.headers)
""" url = http://www.hyfc365.com/RealEstate/RealtyProject/Search.aspx city : 衡阳 CO_INDEX : 181 author: 程纪文 """ from backup.crawler_base import Crawler from backup.comm_info import Building, House import re, requests from lxml import etree from lib.log import LogHandler co_index = '181' city_name = '衡阳' log = LogHandler('衡阳') class Hengyang(Crawler): def __init__(self): self.start_url = 'http://www.hyfc365.com/RealEstate/RealtyProject/Search.aspx' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36', } def start_crawler(self): viewstate = "/wEPDwUKLTM2MzMxMTM1Nw8WBB4PSGlkZUNvbnRleHRNZW51CymEAXprU3VwZXJNYXAuV2ViLlVJLnprU3VwZXJNYXBQYWdlU3R5bGUsIHprU3VwZXJNYXAuQ29tbW9uTGlicmFyeSwgVmVyc2lvbj0xLjEuNTAwLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49NzJkNzZkMzJkOGNiYTIyZgIeD0hpZGVTZWxlY3RTdGFydAsrBAIWAgIBD2QWCgIDD2QWAmYPDxYEHghDc3NDbGFzcwUQY3NzQm94VGl0bGVUaHJlZR4EXyFTQgICZBYCAgEPDxYGHgRUZXh0BRLlvIDlj5HkvIHkuJrmn6Xor6IeC05hdmlnYXRlVXJsBSQvUmVhbEVzdGF0ZS9SZWFsdHlEZWFsZXIvU2VhcmNoLmFzcHgeBlRhcmdldGUWAh4MVGV4dENoYW5naW5nBQRUcnVlZAIFD2QWAmYPDxYEHwIFFGNzc0JveFRpdGxlVGhyZWVPdmVyHwMCAmQWAgIBDw8WBh8EBRTmpbznm5go6aG555uuKeafpeivoh8FZR8GZRYCHwcFBFRydWVkAgcPZBYCZg8PFgQfAgUQY3NzQm94VGl0bGVUaHJlZR8DAgJkFgICAQ8PFgYfBAUUKOe9keS4iinmiL/mupDmn6Xor6IfBQUqL1JlYWxFc3RhdGUvUmVhbHR5U2VhcmNoL1NlYXJjaF9Ib3VzZS5hc3B4HwZlFgIfBwUEVHJ1ZWQCCQ9kFgJmDw8WBB8CBRBjc3NCb3hUaXRsZVRocmVlHwMCAmQWAgIBDw8WBh8EBRLlkIjlkIzlpIfmoYjmn6Xor6IfBQUsL1JlYWxFc3RhdGUvUmVhbHR5U2VhcmNoL1NlYXJjaF9SZWNvcmRzLmFzcHgfBmUWAh8HBQRUcnVlZAITDzwrAAsAZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAQUNQ3VzdG9tUGFnaW5nMbpNuvQVuP+DYqCe1+wbVab+715lNR+eC+hDFTSfvE0y" valid = "/wEWAwKHpppsAqi0zakHArrY8x1xs+nwBroCH5+KiDI9tW1jyttusdquHQRtH5UPs6GOzg==" data = { "CustomPaging1_CurrentPageIndex": -1, "__VIEWSTATE": viewstate,
import re
import time
import json
import threading
import asyncio

import requests
import aiohttp
import pika
from lxml import etree
from pymongo import MongoClient

from lib.proxy_iterator import Proxies
from lib.log import LogHandler

log = LogHandler('xian')
p = Proxies()
p = p.get_one(proxies_number=7)
# p = {'http': 'http://*****:*****@zproxy.lum-superproxy.io:22225'}
m = MongoClient(host='114.80.150.196', port=27777,
                username='******', password='******')
crawler_collection = m['hilder_gv']['xian']


class XiAn:

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }
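    # Illustrative sketch only: the async fetch methods of XiAn are not shown above.
    # Assuming the aiohttp/asyncio imports are used for concurrent page downloads and
    # that p is a requests-style proxy dict (as the commented example suggests), a
    # minimal coroutine might look like this (method name is a hypothetical addition):
    async def _fetch_sketch(self, url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=self.headers,
                                   proxy=p.get('http')) as resp:
                return await resp.text()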
import re
import time
import json
import asyncio

import requests
import aiohttp
import pika
from lxml import etree
from pymongo import MongoClient

from lib.proxy_iterator import Proxies
from lib.log import LogHandler

log = LogHandler('loupan')
p = Proxies()
p = p.get_one(proxies_number=7)
m = MongoClient(host='114.80.150.196', port=27777,
                username='******', password='******')
crawler_collection = m['fangjia']['district_complete']


class LouPanConsumer:

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host='114.80.150.196', port=5673, heartbeat=0))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue='loupan')

    def final_parse(self, data):
        url = data['url']
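    # Illustrative sketch only: the consume loop is not shown above. Assuming pika 1.x,
    # wiring the 'loupan' queue to final_parse would look roughly like this (the method
    # name, JSON message format and manual ack are assumptions, not the original code):
    def _start_consume_sketch(self):
        def callback(ch, method, properties, body):
            data = json.loads(body)
            self.final_parse(data)
            ch.basic_ack(delivery_tag=method.delivery_tag)

        self.channel.basic_consume(queue='loupan', on_message_callback=callback)
        self.channel.start_consuming()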
""" 消费xiaozijia_num队列,请求,入小区库 大约需要1个小时 """ from lib.log import LogHandler from lib.mongo import Mongo from lib.rabbitmq import Rabbit import requests import yaml import json log = LogHandler('小资家_comm') setting = yaml.load(open('config.yaml')) # mongo m = Mongo(setting['xiaozijia']['mongo']['host'], setting['xiaozijia']['mongo']['port'], user_name=setting['xiaozijia']['mongo']['user_name'], password=setting['xiaozijia']['mongo']['password']) coll_comm = m.connect[setting['xiaozijia']['mongo']['db']][ setting['xiaozijia']['mongo']['comm_coll']] # rabbit r = Rabbit(setting['xiaozijia']['rabbit']['host'], setting['xiaozijia']['rabbit']['port']) channel = r.get_channel() queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_num'] build_queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_build'] channel.queue_declare(queue=queue)
import re
import random
import time
import datetime
import json

import requests
from lxml import etree

from deal_price_info import Comm
from lib.log import LogHandler

log = LogHandler('centaline')
source = '中原地产'


class Centaline:

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
        }
        self.start_url = 'http://www.centaline.com.cn/'

    def start_crawler(self):
        res = requests.get(self.start_url, headers=self.headers)
        res.encoding = 'gbk'
        # collect each city's second-hand listing host, then switch it to the
        # community ("xiaoqu") listing pages
        second_city_list = re.findall('http://\w+.centanet.com/ershoufang/',
                                      res.text, re.S | re.M)
        for city in second_city_list:
            city_comm = city.replace('ershoufang', 'xiaoqu')
            city_res = requests.get(city_comm, headers=self.headers)
            city_res.encoding = 'gbk'
import re
import time
import datetime

import requests

from deal_price_info import Comm
from lib.log import LogHandler

log = LogHandler('链家在线')
url = 'https://sh.lianjia.com/'


class Lianjiazaixian():

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        response = requests.get(url, headers=self.headers)
        html = response.text
        # pull the city-switch tab and build a {city name: city url} map,
        # skipping the non-city links
        city_list_html = re.search('city-tab".*?</div></div></div>', html,
                                   re.S | re.M).group()
        city_a_html_list = re.findall('<a.*?</a>', city_list_html, re.S | re.M)
        city_dict = {}
        for i in city_a_html_list:
            city = re.search('<a.*?>(.*?)<', i, re.S | re.M).group(1)
            city_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
            if 'you' not in city_url and 'fang' not in city_url:
                city_dict[city] = city_url
        self.get_city_info(city_dict)
import re
import time
import datetime

import requests

from deal_price_info import Comm
from lib.log import LogHandler

url = 'http://sh.koofang.com/xiaoqu/pg1'
log = LogHandler('上海酷房网')


class Kufangwang():

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        self.get_comm_info(url)
        self.get_all_comm_url(url)

    def get_comm_info(self, page_url):
        response = requests.get(page_url, headers=self.headers)
        html = response.text
        comm_info_html_list = re.findall('<div class="avail_conr">.*?</li>',
                                         html, re.S | re.M)
        for i in comm_info_html_list:
            comm = Comm('上海酷房网')
            comm.city = '上海'
            comm.district_name = re.search('class="avail_cont".*?>(.*?)<', i,
# from deal_price_info import Comm
from BaseClass import Base
import re
import time
import datetime

import requests
from lxml import etree

from lib.log import LogHandler
from lib.proxy_iterator import Proxies

p = Proxies()
source = '房途网'
log = LogHandler('房途网')


class Fangtu(object):

    def __init__(self, proxies):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
        }
        self.start_url = 'http://hangzhou.fangtoo.com/building/'
        self.proxies = proxies

    def start_crawler(self):
        url = 'http://hangzhou.fangtoo.com/building/cp1/'
        res = requests.get(url=url, headers=self.headers, proxies=self.proxies)
        num = re.search('pagecount:(\d+),', res.text, re.S | re.M).group(1)
        for i in range(1, int(num) + 1):
            url = self.start_url + "cp" + str(i) + "/"
            try:
# ret, info = bucket_manager.fetch(url, bucket, filename)
# if info.status_code == 200:
#     # file_url = bucket_domain + "/" + filename
#     print(file_url)
#     return file_url
# else:
#     print("{} fetch failed".format(url))
"""
Image crawling
"""
from retry import retry                   # needed for the @retry decorator below

from lib.log import LogHandler            # needed for LogHandler("qiniu") below
from lib.proxy_iterator import Proxies    # needed for Proxies() below

proxy = Proxies()
bucket = 'fangjia-img'
log = LogHandler("qiniu")


@retry(delay=2)
def qiniufetch(url, file_name):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    if 'http' in url:
        """ Use the proxy pool """
        # image_download = Proxy_contact(app_name='qiniufetch', method='get', url=url, headers=headers)
        # con = image_download.contact()
        # while True:
        #     try:
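# Illustrative sketch only: the body of qiniufetch is truncated above. Assuming the
# intent is to download the image and push the bytes into the 'fangjia-img' bucket,
# the upload half could look roughly like this with the qiniu SDK (the Auth keys are
# placeholders and _upload_sketch is a hypothetical helper, not the original code):
import requests
from qiniu import Auth, put_data

def _upload_sketch(url, file_name, access_key='<ACCESS_KEY>', secret_key='<SECRET_KEY>'):
    img = requests.get(url, timeout=10).content            # fetch the raw image bytes
    token = Auth(access_key, secret_key).upload_token(bucket, file_name)
    ret, info = put_data(token, file_name, img)            # push the bytes into the bucket
    if info.status_code == 200:
        return file_name
    log.error('{} upload failed'.format(url))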
city : 韶关
CO_INDEX : 194
author: 程纪文
"""
import re

import requests
from lxml import etree

from crawler_base import Crawler
from comm_info import Comm, Building, House
from get_page_num import AllListUrl
from producer import ProducerListUrl
from lib.log import LogHandler

co_index = '194'
city_name = '韶关'
log = LogHandler('韶关')


class Shaoguan(Crawler):

    def __init__(self):
        self.start_url = 'http://61.143.241.154/user_kfs.aspx'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
            'Referer': 'http://61.143.241.154/user_itemlist.aspx'
        }
        self.proxies = [
            {
                "http": "http://192.168.0.96:3234"
            },
            {